Here, I analyse and document my progress with the analysis of a world-wide whole genome sampling of Zymoseptoria tritici. Some of the analysis are just exploratory while some other are lined up in a clear path to answering questions related to the history of Z.tritici and to better understand its adaptation to various climates.

library(knitr)
library(reticulate)

#Spatial analyses packages
library(raster)
library(sp)
library(rgdal)
library(maps)
library(geosphere)

#Data wrangling and data viz
library(purrr)
library(RColorBrewer)
library(plotly)
library(cowplot)
library(GGally)
library(corrplot)
library(pheatmap)
library(ggstance)
library('pophelper')
library(ggbiplot)
library(igraph)
library(ggraph)
library(ggtext)
library(scatterpie)
library(tidyverse)

#Pop structure and pop genomic
library(GenomicFeatures)
library(SNPRelate) #PCA
library(LEA) #Clustering
library(pophelper)
#library(PopGenome) #Summary statistics
library(gridExtra)
library(ggExtra)
library(ggtree)
library(tidytree)
library(multcomp)
library(lsmeans)

#GEA
library(lfmm)

#Statistics
library(car)
library(corrr)
library(lsmeans)
library(multcomp)

library(caret)
library(mgcv)

#Variables
world <- map_data("world")
project_dir="~/Documents/Postdoc_Bruce/Projects/WW_project/"
lists_dir = "~/Documents/Postdoc_Bruce/Projects/WW_project/WW_PopGen/Keep_lists_samples/"

#Data directories
data_dir=paste0(project_dir, "0_Data/")
metadata_dir=paste0(project_dir, "Metadata/")
fig_dir = "~/Documents/Postdoc_Bruce/Manuscripts/Feurtey_WW_Zt/Draft_figures/"

#Analysis directories
#-___________________
VAR_dir = paste0(project_dir, "1_Variant_calling/")
  depth_per_window_dir = paste0(VAR_dir, "1_Depth_per_window/")
  vcf_dir = paste0(VAR_dir, "4_Joint_calling/")
  mito_SV = paste0(VAR_dir, "6_Mito_SV/")
PopStr_dir = paste0(project_dir, "2_Population_structure/")
  nuc_PS_dir=paste0(PopStr_dir, "0_Nuclear_genome/")
  mito_PS_dir = paste0(PopStr_dir, "1_Mitochondrial_genome/")
Sumstats_dir = paste0(project_dir, "3_Sumstats_demography/")
TE_RIP_dir=paste0(project_dir, "4_TE_RIP/")
   RIP_DIR=paste0(TE_RIP_dir, "0_RIP_estimation/")
   DIM2_DIR=paste0(TE_RIP_dir, "1_Blast_from_denovo_assemblies/")
GEA_dir=paste0(project_dir, "5_GEA/")
fung_dir=paste0(project_dir, "6_Fungicide_resistance/")
virulence_dir = paste0(project_dir, "7_Virulence/")
sel_dir = paste0(project_dir, "8_Selection/")
  gene_list_dir = paste0(sel_dir, "0_Lists_unique_copy/")

#Files
vcf_name="Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp"
vcf_name_nomaf="Ztritici_global_March2021.filtered-clean.AB_filtered.SNP.max-m-0.8.thin-1000bp"
vcf_name_mito = "Ztritici_global_March2021.genotyped.mt.filtered.clean.AB_filtered.variants.good_samples.max-m-80"
Zt_list = paste0(lists_dir, "Ztritici_global_March2021.genotyped.good_samples.args")
gff_file = paste0(data_dir, "Zymoseptoria_tritici.MG2.Grandaubert2015.no_CDS.gff3")
ref_fasta_file = paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa")
metadata_name = "Main_table_from_SQL_Feb_2020"
gene_annotation = read_tsv(paste0(data_dir, "Badet_GLOBAL_PANGENOME_TABLE.txt"))
complete_mito = read_tsv(paste0(data_dir, "Complete_mitochondria_from_blast.txt"), col_names = c("ID_file", "Contig"))

Sys.setenv(PROJECTDIR=project_dir)
Sys.setenv(VARDIR=VAR_dir)
Sys.setenv(VCFDIR=vcf_dir)
Sys.setenv(POPSTR=PopStr_dir)
Sys.setenv(MITOPOPSTR=mito_PS_dir)

Sys.setenv(SUMST=Sumstats_dir)
Sys.setenv(GEADIR=GEA_dir)

Sys.setenv(ZTLIST=Zt_list)
Sys.setenv(GFFFILE = gff_file)
Sys.setenv(REFFILE = ref_fasta_file)
Sys.setenv(VCFNAME=vcf_name)
Sys.setenv(VCFNAME_NOMAF=vcf_name_nomaf)
Sys.setenv(VCFNAME_MITO=vcf_name_mito)

#knitr::opts_chunk$set(echo = F)
knitr::opts_chunk$set(message = F)
knitr::opts_chunk$set(warning = F)
knitr::opts_chunk$set(results = T)


# Metadata and sample lists
##Filtered_samples
filtered_samples = bind_rows(
  read_tsv(paste0(metadata_dir, "Sample_removed_based_on_IBS.args"), col_names = "ID_file") %>%
  mutate(Filter = "IBS"),
read_tsv(paste0(metadata_dir, "Sample_with_too_much_NA.args"), col_names = "ID_file") %>%
  mutate(Filter = "High_NA"),
read_tsv(paste0(metadata_dir, "Samples_to_filter_out.args"), col_names = "ID_file") %>%
  mutate(Filter = "Mutants_etc"))

##Samples in vcf
genotyped_samples = read_tsv(Zt_list, col_names = "ID_file")

## Metadata of genotyped samples 
temp = read_tsv(paste0(metadata_dir, metadata_name, "_Description.tab"), col_names = F) %>% pull()

Zt_meta = read_delim(paste0(metadata_dir, metadata_name, "_with_collection.tab"), 
                 col_names = temp,
                 na = "\\N", guess_max = 2000) %>%
  unite(Coordinates, Latitude, Longitude, sep = ";", remove = F) %>%
  inner_join(., genotyped_samples)  %>%
  mutate(Country = ifelse(Country == "USA", paste(Country, Region, sep = "_"), Country)) %>%
  mutate(Country = ifelse(Country == "Australia", paste(Country, Region, sep = "_"), Country)) %>%
  mutate(Country = ifelse(Country == "NZ", "New Zealand", Country)) %>%
  mutate(Country = ifelse(Country == "CH", "Switzerland", Country)) %>%
  mutate(Latitude2 = round(Latitude, 2), Longitude2 = round(Longitude, 2)) %>%
  dplyr::select(ID_file, Sampling_Date, Collection,
                Country, Continent, Latitude, Longitude, Latitude2, Longitude2)

#genotyped_samples %>%
#  filter(!(ID_file %in% filtered_samples$ID_file)) %>%
#    write_tsv(Zt_list, col_names = F)



#Define colors
## For continents
#myColors <- c("#04078B", "#a10208", "#FFBA08", "#CC0044", "#5C9D06", "#129EBA","#305D1B")
myColors <- c("#DA4167", "grey", "#ffba0a", "#A20106", "#3F6E0C", "#129eba", "#8fa253" )
names(myColors) = levels(factor(Zt_meta$Continent))
Color_Continent = ggplot2::scale_colour_manual(name = "Continent", values = myColors)
Fill_Continent = ggplot2::scale_fill_manual(name = "Continent", values = myColors)

## For clustering
K_colors = c("#f9c74f", "#f9844a", "#90be6d", "#f5cac3", 
"#83c5be", "#f28482", "#577590", "#e5e5e5", "#a09abc",  "#52796f",
"#219ebc", "#003049", "#82c0cc", "#283618", "white")

## For correlations
mycolorsCorrel<- colorRampPalette(c("#0f8b8d", "white", "#a8201a"))(20)

#Random gradients
greens=c("#1B512D", "#669046", "#8CB053", "#B1CF5F", "#514F59")
blues=c("#08386E", "#1C89C9", "#28A7C0", "#B0DFE8", "grey")
# Uploading commands from the cluster to my computer.

# 1 - Variant calling - Nuclear
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall* ~/Documents/Postdoc_Bruce/Projects/WW_project/1_Variant_calling/4_Joint_calling/
rsync -avP alice@130.125.25.244:/home/alice/WW_PopGen/Keep_lists_samples/Ztritici_global_March2021.genotyped.good_samples.args \
  ../WW_PopGen/Keep_lists_samples/

# 1 - Variant calling - Mito 
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.mt.filtered.clean.AB_filtered.variants.good_samples.max-m-80.recode.vcf.gz  ~/Documents/Postdoc_Bruce/Projects/WW_project/1_Variant_calling/4_Joint_calling/

rsync -avP alice@130.125.25.244:/data2/alice/WW_project/0_Data/4_Mitochondrial_genome/0_Mito_blast/Complete_mitochondria_from_blast.txt ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/

# 1 - Variant calling - Depth 
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.1kb_windows.nuc_GC.tab ../0_Data/
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/1_Depth_per_window/Depth_per*summary.q30.txt  \
   ~/Documents/Postdoc_Bruce/Projects/WW_project/1_Variant_calling/1_Depth_per_window/
# Depth_per_sample_core_chr_summary.q30.txt
# Depth_per_sample_summary.q30.txt
# Depth_per_chromosome_summary.q30.txt
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/2_Depth_per_gene/Depth_per_genes_normalized.txt \
    ../1_Variant_calling/2_Depth_per_gene/
  
# 1 - Variant calling - SV
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/6_Mito_SV/All_complete_mitochondria.* \
  ~/Documents/Postdoc_Bruce/Projects/WW_project/1_Variant_calling/6_Mito_SV/
#All_complete_mitochondria.mcoords
#All_complete_mitochondria.snps

# 2 -Population structure
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/2_Population_structure/1_Mitochondrial_genome/Mitochondria.blast_results.tsv\
    ../2_Population_structure/1_Mitochondrial_genome/


# 4 - TE and RIP
rsync -avP \
  alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Nb_reads* \
  ~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/0_RIP_estimation/
# Nb_reads_per_TE.txt
# Nb_reads.txt
rsync -avP \
  alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Composite_index.txt \
  ~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/0_RIP_estimation/
rsync -avP   alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/1_Blast_dim2_deRIPped/ \
 ~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/
  
  
# 5 - GEA
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/5_GEA/output/* ~/Documents/Postdoc_Bruce/Projects/WW_project/5_GEA/


# 6 - Fungicides
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/6_Fungicide_resistance/genotypes_tidy_format.tab ~/Documents/Postdoc_Bruce/Projects/WW_project/6_Fungicide_resistance/

Sampling


The sampling of Z.tritici isolated from the natural environment covers almost the entirety of the wheat-grown continents. It is, however, highly heterogeous. Europe has the highest sampling density. Several locations are heavily sampled, such a fields in Switzerland or the US. Some of the available genomes are still under embargo (waiting for the publication of their creator). We are also thinking about sequencing more genomes. Here you can view these different datasets on a map. Please select and unselect the different values to have a view of the changes with added datasets.

  • “Usable genomes” are the genomes we have already and which have no embargo on publication.
  • “Present and future” are the isolates sequenced already and without embargo plus the planned sequencing
  • “Present, future and JGI” are the isolates described above plus the JGI genomes
#kable(Zt_meta %>% dplyr::count(Collection, name = "Number of genomes"))
Zt_meta %>% dplyr::count(Collection, name = "Number of genomes")
## # A tibble: 18 x 2
##    Collection             `Number of genomes`
##    <chr>                                <int>
##  1 BIOGER_Thierry                          17
##  2 Eschikon_2017                           94
##  3 ETHZ_2020                              128
##  4 Fillinger                                2
##  5 GWASpanel_BIOGER                        90
##  6 Hartmann_FstQst_2015                   132
##  7 Hartmann_Oregon_2016                    94
##  8 JGI                                     43
##  9 JGI_1                                    1
## 10 JGI_2                                   32
## 11 JGI_3                                   20
## 12 JGI_4                                    3
## 13 JGI_Thierry                             43
## 14 Megan_McDonald                          13
## 15 MMcDonald_2018                          92
## 16 Syngenta                               283
## 17 Third_batch_2018_BM_TM                  16
## 18 <NA>                                     6
max_circle = max(Zt_meta %>%
  dplyr::count(Latitude, Longitude, name = "Number_genomes") %>%
  dplyr::select(Number_genomes))

temp = Zt_meta %>%
   dplyr::count(Country, Latitude, Longitude, name = "Number_genomes") %>%
   filter(Number_genomes > 0)

empty_map = ggplot() + theme_void() +
  geom_polygon(data = world, aes(x=long, y = lat, group = group), fill="#ede7e3", alpha=0.7)

empty_map +
  geom_point (data = temp, aes(x=as.numeric(Longitude), y=as.numeric(Latitude),
                                    size=Number_genomes,
                                    text = Country),
                                  alpha = 0.6, color = "#16697a") +
  scale_size("Number of genomes", limits = c(1, max_circle))

p1 = empty_map +
  geom_point(data = temp, 
             aes(x=as.numeric(Longitude), y=as.numeric(Latitude),size=Number_genomes, text = Country),
             alpha = 0.6, color = "#ff9f1c") +
  scale_size("Number of genomes", limits = c(1, max_circle)) + 
  coord_cartesian(xlim=c(-20, 60), ylim=c(20, 70)) +
  theme(legend.position = "none")
p2 = empty_map +
  geom_point(data = temp, 
             aes(x=as.numeric(Longitude), y=as.numeric(Latitude),size=Number_genomes, text = Country),
             alpha = 0.6, color = "#ff9f1c") +
  scale_size("Number of genomes", limits = c(1, max_circle)) + coord_cartesian(xlim=c(-160, -40), ylim=c(-80, 80)) +
  theme(legend.position = "none")
p3 = empty_map +
  geom_point(data = temp, 
             aes(x=as.numeric(Longitude), y=as.numeric(Latitude),size=Number_genomes, text = Country),
             alpha = 0.6, color = "#ff9f1c") +
  scale_size("Number of genomes", limits = c(1, max_circle)) + coord_cartesian(xlim=c(115, 175), ylim=c(-60, 10))  


#Plotting all the maps together! 
aus_map = cowplot::plot_grid(p3+ theme(legend.position = "none"), get_legend(p3), ncol = 2, rel_widths = c(1, 0.9))
ligne = cowplot::plot_grid(p1, aus_map, ncol = 1,  rel_heights = c(1, 0.7))
cowplot::plot_grid(p2, ligne, ncol = 2, rel_widths = c(0.7, 1)) 

The sampling also covers a wide range of years: starting from 1990 to 2017. Just as with the geographical repartition, some years are heavily sampled, reflecting sampling in specific fields done for previous experiments.

temp = as_tibble(c(min(Zt_meta$Sampling_Date, na.rm = T) : max(Zt_meta$Sampling_Date, na.rm = T))) %>%
  mutate(`Sampling year` = as.character(value))

sum_temp = Zt_meta %>%
    mutate(`Sampling year` = as.character(Sampling_Date)) %>%
    dplyr::count(`Sampling year`, Continent) %>%
    full_join(., temp) %>%
    mutate(`Genome number` = replace_na(n, 0))

sum_temp %>%
ggplot(aes(x=`Sampling year`, y =`Genome number`, fill = Continent)) +
  geom_bar(stat = "identity") +
  theme_bw() + theme(axis.title = element_blank(),
                     axis.text.x = element_text(angle = 60, hjust = 1)) +
  Fill_Continent

countries = dplyr::count(Zt_meta, Country, Continent)

countries %>% filter(n >= 8) %>%
  ggplot(aes(x = Country, y =n, fill = Continent)) +
    geom_bar(stat = "identity") +
    Fill_Continent +
    theme_cowplot() +
    labs(x = element_blank(), y = "Number of samples") +
    theme(axis.text.x = element_text(angle = 40, hjust = 1, vjust = 1))

for (country in filter(countries, n >= 8) %>% pull(Country)) {
  country_name = str_replace(country, pattern = " ", replacement = "_")
  file_name = paste0(PopStr_dir, "Sample_list_", country_name, ".args")
  
  Zt_meta %>% filter(Country == country) %>%
    dplyr::select(ID_file) %>%
    write_tsv(file = file_name, col_names = F)
}
for ( min_number in c(6, 10) ) {
  
  small_pops = Zt_meta %>%
    filter(!is.na(Country) & !is.na(Sampling_Date)) %>%
    dplyr::count(Country, Latitude2, Longitude2, Sampling_Date, name = "Number_genomes") %>%
    filter( Number_genomes >= min_number ) 
  
  map1 = ggplot() + 
    theme_void() +
    geom_polygon(data = map_data("world"), aes(x=long, y = lat, group = group), fill="#ede7e3")  +
    scale_size("Number of genomes", limits = c(1, max_circle))  +
    theme(legend.position = "None", 
          panel.border  = element_rect(fill=NA, colour = "grey",size = 0.5, linetype = 1))+
      scale_fill_manual(values = c("grey", K_colors) )
  
  p1 = map1 + coord_cartesian(xlim=c(-20, 60), ylim=c(20, 70)) + 
    geom_point(data = small_pops, 
               mapping = aes(x=Longitude2, y=Latitude2, size=Number_genomes),
               alpha = 0.6, color = "#16697a")
  
  p2 = map1 + coord_cartesian(xlim=c(-160, -40), ylim=c(-80, 80)) + 
    geom_point(data = small_pops, 
               mapping = aes(x=Longitude2, y=Latitude2, size=Number_genomes),
               alpha = 0.6, color = "#16697a")
  
  p3 = map1 + coord_cartesian(xlim=c(115, 175), ylim=c(-65, 10)) + 
    geom_point(data = small_pops, 
               mapping = aes(x=Longitude2, y=Latitude2, size=Number_genomes),
               alpha = 0.6, color = "#16697a")
  
  aus_map = cowplot::plot_grid(p3, get_legend(p3), ncol = 2, rel_widths = c(1, 0.9))
  ligne = cowplot::plot_grid(p1, aus_map, ncol = 1,  rel_heights = c(1, 0.7))
  cowplot::plot_grid(p2, ligne, ncol = 2, rel_widths = c(0.7, 1)) 
  
  
  
  for (t in 1:nrow(small_pops)) {
    country = pull(small_pops[t, "Country"])
    year = pull(small_pops[t, "Sampling_Date"])
    lat = pull(small_pops[t, "Latitude2"])
    long = pull(small_pops[t, "Longitude2"])
    country_name = str_replace(country, pattern = " ", replacement = "_")
    
    for (i in 1:10 ){
      
      file_name = paste0(PopStr_dir, "Sample_list_subset_", 
                         paste(min_number, country_name, year, lat, long, i, sep = "_"), 
                         ".args")
      
      temp = Zt_meta %>% 
        filter(Country == country) %>%
        filter(Sampling_Date == year)  %>% 
        filter(Latitude2 == lat) %>%
        filter(Longitude2 == long) %>%
        dplyr::select(ID_file) %>%
        pull()
      
      write_tsv(as.tibble(sample(temp, size = min_number)), file = file_name, col_names = F) 
    }
  }
}



Variant calling


Whole chromosomes SNV

Question: How prelavent is aneuploidy in natural populations of Z.tritici? In the case of accessory chromosomes, is there a correlation between phylogeny, environment, host or time and the presence/absence of some chromosomes?

Methods: Based on the depth of coverage for all samples, we can identify for both core and accessory chromosomes whether each isolates includes 1, 0 or several copies.

core_depth = read_tsv(paste0(depth_per_window_dir, "Depth_per_sample_core_chr_summary.q30.txt")) %>%
  mutate(Median_core = Median)

chrom_depth = read_tsv(paste0(depth_per_window_dir, "Depth_per_chromosome_summary.q30.txt")) %>%
  left_join(., core_depth %>% 
  dplyr::select(Sample, Median_core)) %>%
  mutate(Relative_depth = Median/Median_core) %>%
  left_join(.,Zt_meta, by = c("Sample" = "ID_file")) %>%
  filter(CHROM != "mt") %>%
  mutate(Sample = fct_reorder(Sample, Sampling_Date)) %>%
  mutate(Sample = fct_reorder(Sample, Country)) %>%
  mutate(Sample = fct_reorder(Sample, Continent)) 

heatmap_depth = chrom_depth %>%
  filter(CHROM != "mt") %>%
  ggplot(aes(x = as.numeric(CHROM), y=Sample, fill=Relative_depth)) +
  geom_tile() + scale_fill_gradient2(low="white", high = "#479DAE") +
  geom_vline(xintercept = 13.5, linetype = "longdash", colour = "gray20") +
  theme(axis.text.y = element_blank(),
        axis.title.y = element_blank(), axis.ticks.y=element_blank()) +
  labs(fill = "Depth", x= "Chromosome") + xlim(c(0.5, 21.5))


plot_continent = chrom_depth %>%
  ggplot(aes(x = 1, y=Sample, fill=Continent)) +
  geom_tile(aes(width = 2))  +
  theme_classic() +
  theme(axis.text.y = element_blank(), axis.ticks.y=element_blank(),
        legend.position="left",
        axis.text.x=element_text(colour="white")) +
  labs(y= "Isolate") + Fill_Continent

plot_grid(plot_continent, heatmap_depth, rel_widths = c(2, 5))

In the heatmap, I represent the depth normalized by the median depth over all core chromosomes. As expected, the copy-number variation at the chromosome scale affects mostly the accessory chromosomes (AC). There is some presence of supernumerary AC and a lot of presence-absence variation. Supernumerary chromosomes can also be found in the core chromosomes but this is almost anecdotal as over the whole sampling this was found only in 9 cases.

Lthres = 0.50
Hthres = 1.50
depth = chrom_depth %>%
  dplyr::mutate(Depth_is = ifelse(Relative_depth > Hthres, "High",ifelse(Relative_depth < Lthres, "Low", "Normal"))) %>%
  mutate(CHROM = fct_reorder(CHROM, as.numeric(CHROM)))  

bar_Ndepth_per_CHR =ggplot(depth, aes(x = CHROM, fill = Depth_is)) +
  geom_bar(stat = "count") +
  scale_fill_manual(values =c("#16697a", "#82c0cc", "#EDE7E3")) +
    theme_light()+
    labs(x= "Chromosome", y = "Number of isolates")

#lollipop plots
##For high normalized depth values
temp = depth %>%
  filter(Depth_is == "High") %>%
  dplyr::group_by(CHROM) %>%
  dplyr::count() %>%
  mutate(CHROM = fct_reorder(CHROM, as.numeric(CHROM))) 
lolhigh =  ggplot(temp, aes(x = as.character(CHROM), y = n)) +
    geom_segment( aes(x=as.character(CHROM), xend=as.character(CHROM), y=0, yend=max(temp$n)),
                  color="grey80", size = 1) +
    geom_segment( aes(x=as.character(CHROM), xend=as.character(CHROM), y=0, yend=n),
                  color="grey20", size = 1) +
    geom_point( color="#16697a", size=4)  +
    geom_text(aes( label = n,
                     y= n), stat= "identity",
              hjust = -0.5, vjust = -0.2) +
    theme_light() +
    theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10)
    ) +
    ylim(c(0,max(temp$n)+2+ max(temp$n)*0.1)) +
    labs(x= "Chromosome", y = "Number of isolates with supernumerary chromosome") +
    coord_flip()

##For low normalized depth values
temp = depth %>%
  filter(Depth_is == "Low") %>%
  dplyr::group_by(CHROM) %>%
  dplyr::count()%>%
  mutate(CHROM = fct_reorder(CHROM, as.numeric(CHROM)))

lollow = ggplot(temp, aes(x = CHROM, y = n)) +
    geom_segment( aes(x=CHROM, xend=CHROM, y=0, yend=max(temp$n)),
                  color="grey80", size = 1) +
    geom_segment( aes(x=CHROM, xend=CHROM, y=0, yend=n),
                  color="grey20", size = 1) +
    geom_point( color="#82c0cc", size=4)  +
    geom_text(aes( label = n,
                     y= n), stat= "identity",
              hjust = -0.5, vjust = -0.2) +
    theme_light() +
    theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10)
    ) +
    ylim(c(0,max(temp$n)+ max(temp$n)*0.1)) +
    labs( x= "Chromosome", y = "Number of isolates without chromosome") +
    coord_flip()

bottom_row <- cowplot::plot_grid(lolhigh, lollow, labels = c('B', 'C'), label_size = 12)

plot_grid(bar_Ndepth_per_CHR, bottom_row, labels = c('A', ''), label_size = 12, ncol = 1)

Here is a table including the isolates with supernumerary core chromosomes.

depth  %>%
  dplyr::filter(Depth_is == "High") %>%
  dplyr::filter(as.numeric(CHROM) < 13) %>%
  dplyr::select(Sample, CHROM, Median, Median_core, Collection, Country)
## # A tibble: 18 x 6
##    Sample               CHROM Median Median_core Collection         Country     
##    <fct>                <fct>  <dbl>       <dbl> <chr>              <chr>       
##  1 08STCZ011            12        47          24 Syngenta           Czech Repub…
##  2 08STF035             5        154          79 Syngenta           France      
##  3 08STF036             12       159          82 Syngenta           France      
##  4 09STD041             4        157          86 Syngenta           Germany     
##  5 ST00ARG_BD0069       5         17           9 <NA>               <NA>        
##  6 ST13SP_Biog_SeptoDu… 4        100          49 BIOGER_Thierry     Spain       
##  7 ST16CH_2X10          1          2           1 <NA>               <NA>        
##  8 ST16CH_2X10          2          2           1 <NA>               <NA>        
##  9 ST16CH_2X10          3          2           1 <NA>               <NA>        
## 10 ST16DK_Biog_DK15     5         16           8 <NA>               <NA>        
## 11 ST90ORE_a12_3B_11    12        32          19 Hartmann_FstQst_2… USA_Oregon  
## 12 STnnJGI_SRR4235099   12        28          17 JGI_2              USA_Oregon  
## 13 STnnJGI_SRR4235109   5         33          17 JGI_2              Switzerland 
## 14 STnnJGI_SRR5194526   12        78          40 JGI                Sweden      
## 15 STnnJGI_SRR5194528   5         84          42 JGI                Sweden      
## 16 STnnJGI_SRR5194605   5         73          37 JGI                Hungary     
## 17 STnnJGI_SRR5829692   4        165          84 JGI                Kenya       
## 18 STnnJGI_SRR5829692   5        163          84 JGI                Kenya

And an overlook of the accessory chromosomes PAV per continent (only considering continents with more than 10 isolates).

depth %>%
  dplyr::filter(as.numeric(CHROM) > 13) %>%
  dplyr::group_by(Continent, CHROM) %>%
  dplyr::mutate(Count_sample_per_continent = n()) %>%
  dplyr::filter(Count_sample_per_continent >= 10) %>%
  ggplot(aes(x = Continent, fill = Depth_is)) +
  geom_bar(position = "fill" ) + facet_wrap(CHROM~.) +
  theme_light() +
  scale_fill_manual(values = c("#16697a", "#82c0cc", "#EDE7E3")) +
    theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
  ylab("Proportion of chromosomes") + xlab("")

GC bias

Estimates for the GC bias were done per isolate in the Depth.Rmd script. Here, I simply load this data and represent it graphically.

# 

GC_per_window = read_tsv(paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.1kb_windows.nuc_GC.tab")) %>%
  mutate(GC = round(`5_pct_gc`, 2)) %>%
  filter( !is.na(`#1_usercol`))

temp = GC_per_window %>%
  filter(`#1_usercol` < 14)  %>%
  dplyr::select(CHROM = `#1_usercol`, Win_start = `2_usercol`, Win_stop = `3_usercol`, GC) %>%
  group_by(GC) %>%
  dplyr::count() 

# Histogram of GC values
xint = median(GC_per_window$GC)
temp %>%
  ggplot(aes(x = GC, y = n, fill = n > 40)) +
    geom_bar(stat = "identity") +
    theme_light() +
  scale_fill_manual(values =c( "#82c0cc", "#EDE7E3"))+
  geom_vline(xintercept = xint, col = "#82c0cc") +
  annotate("text", label = paste0("Median GC is ", xint), 
           x = xint - xint*0.2, y = 5200, 
           size = 4, colour = "#82c0cc", fontface = "bold")

The distribution of GC is slightly skewed in the GC-rich side, with a median above 0.5. However, there is an AT-rich fat tail, or even a bimodal distribution. These windows are most probably containing RIP-affected repeated regions.

From there, we need to know if the different collections present GC bias in their sequencing. We expect a batch effect in the sequencing here since we have data coming from a lot of different years and labs (and because batch effect happens in general).

GC_bias = read_tsv(paste0(depth_per_window_dir, "GC_bias_per_sample.txt"))

temp = GC_bias %>%
  filter(Collection != "\\N") %>%
  group_by(Collection) %>%
  dplyr::count() %>%
  filter(n >= 20) 

inner_join(temp, GC_bias) %>%
  group_by(Collection) %>%
  mutate(Median = median(GC_bias_slope)) %>%
 ggplot(., aes(x = reorder(Collection, Median), y = GC_bias_slope, fill = reorder(Collection, Median))) +
  geom_violin(alpha = .7) +
  theme_bw()  +
  theme(axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1),
        plot.margin = unit(c(0.5, 0.5, 0.5, 1), "cm")) +
  labs(x = "Collection", fill = "Collection", y = "GC bias estimate")

It is very clear that we do have a batch effect but also that some datasets are very, very biased. The older Hartmann data (also called Qst population) in particular is strongly biased.



Population structure / biogeography


Question: Is the world-wide population of Z.tritici structured? If so, is it structured according to geography, host or time (or any other relevant info we hopefully have)?

Previous genomic work has shown very clear structure between populations of Z.tritici. However,the sampling was extremely heterogeneous. With a more geographically even sampling, do we also observe a clear-cut structure?
Methods: Analyses to create would be:

  • PCA
  • Structure-like analysis,
  • Tree (rooted on Za to infer origin if pattern?) #TODO
  • Map with cluster if clusters #TODO

Structure-like clustering

The clustering here is done by using the snmf method from the LEA R package (http://membres-timc.imag.fr/Olivier.Francois/LEA/) on the same subset of SNPs as the PCA, but without any missing data. I ran the analysis for a K (number of cluster inferred) ranging from 1 to 15 and with 10 repeats for each K.


vcftools --vcf ${VCFDIR}$VCFNAME.recode.vcf \
  --extract-FORMAT-info GT \
  --out ${POPSTR}$VCFNAME

cat  ${POPSTR}$VCFNAME.GT.FORMAT | cut -f 3- \
    > ${POPSTR}$VCFNAME.GT.FORMAT2
cat  ${POPSTR}$VCFNAME.GT.FORMAT | cut -f 1,2 \
    > ${POPSTR}$VCFNAME.GT.FORMAT.pos
head -n1 ${POPSTR}$VCFNAME.GT.FORMAT2 | gsed "s/\t/\n/g"  \
    > ${POPSTR}$VCFNAME.ind
gsed "s/\t//g"  ${POPSTR}$VCFNAME.GT.FORMAT2 | tail -n +2 \
    > ${POPSTR}$VCFNAME.geno
    
datamash transpose < ~/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp.even_sampling.GT.FORMAT2  |  sed 's/^/>/' | sed 's/\t/\n/' | sed 's/\t//g' | cut -c -150 > ~/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp.even_sampling.fasta
#project = snmf(paste0(PopStr_dir, vcf_name, ".geno"), K=1:15, entropy = TRUE,
#                        repetitions = 10, project = "new", ploidy = 1)
# Reading the results from the snmf runs
# ______________________________________
#Sample names
indv_snmf = read_tsv(paste0(PopStr_dir, vcf_name, ".ind"), col_names = F)
names(indv_snmf) = "Sample"

#Load project
project = load.snmfProject(paste0(PopStr_dir, vcf_name, ".snmfProject"))

K_list = c(1:15)

#Extracting the clustering info for the best run per K
datalist = list()
for (i in K_list){
  best = which.min(cross.entropy(project, K = i))
  temp = as.data.frame(Q(project, i, best))
  temp= cbind(indv_snmf, temp)

  temp = temp %>%
    gather("Cluster", "Admix_coef", -"Sample") %>%
    mutate(K=i)
   datalist[[i]] = as.tibble(temp)
}

snmf_results_per_K = bind_rows(datalist) %>%
  inner_join(., Zt_meta, by = c("Sample" = "ID_file")) %>%
  unite(Continent, Country, col = "for_display", remove = F)  


#sNMF pretty plots
# _______________
afiles = character(length(K_list))
for (i in K_list){
  best = which.min(cross.entropy(project, K = i))
  afiles[i] = Sys.glob(paste0(PopStr_dir, vcf_name, ".snmf/K",i, "/run", best, "/*Q"))
}

# create a qlist
qlist <- readQBasic(afiles)
al_qlist = alignK(qlist)

lab_set = inner_join(indv_snmf, Zt_meta, by = c("Sample" = "ID_file")) %>%
  dplyr::select(Continent, Country) %>%
  mutate(Continent = ifelse(is.na(Continent), "Unknown", Continent),
         Country = ifelse(is.na(Country), "Unknown", Country))

#Low numbers
from = 2
up_to = 6
p1 <-   plotQ(alignK(qlist[from:up_to], type = "across"),
            imgoutput="join",
            returnplot=T,exportplot=F,
            basesize=11,
            splab= paste0("K=", K_list[from:up_to]),
            grplab=lab_set, ordergrp=T, grplabangle = 40, grplabheight = 2, grplabsize = 2,
            clustercol = K_colors)

grid.arrange(p1$plot[[1]])

#Medium numbers
from = 7
up_to = 11
p2 <-   plotQ(alignK(qlist[from:up_to], type = "across"),
            imgoutput="join",
            returnplot=T,exportplot=F,
            basesize=11,
            splab= paste0("K=", K_list[from:up_to]),
            grplab=lab_set, ordergrp=T, grplabangle = 40, grplabheight = 2, grplabsize = 2,
            clustercol = K_colors)
grid.arrange(p2$plot[[1]])

#High numbers
from = 12
up_to = 15
p3 <-   plotQ(alignK(qlist[from:up_to], type = "across"),
            imgoutput="join",
            returnplot=T,exportplot=F,
            basesize=11,
            splab= paste0("K=", K_list[from:up_to]),
            grplab=lab_set, ordergrp=T, grplabangle = 40, grplabheight = 2, grplabsize = 2,
            clustercol = K_colors)
grid.arrange(p3$plot[[1]])

The results from the PCA and from the clutering analysis are coherent: Oceania separates into 3 clusters (one in New_Zealand, and two in Australia) and the North American isolates form two separate clusters. Higher K values also distinguish a Middle-Eastern/African cluster from the European cluster, representing the two extreme points of the gradient found between these populations in the PCA. Despite the high number of isolates from Europe, it’s interesting to see that no clustering appears there.

I need to identify the isolates which belong to each of the clusters. For this, we need to set a threshold, since there are very few (or even no) isolates assigned fully to one only. I test two thresholds: isolates assigned to one cluster with a value higher than 0.75 and 0.9.

#Setting thresholds to compare
chosen_threshold = 0.75
chosen_threshold2 = 0.9

pure_by_threshold = bind_rows(snmf_results_per_K %>%
    filter(Admix_coef > chosen_threshold) %>% 
    mutate(Threshold = paste0("> ", chosen_threshold)), 
  snmf_results_per_K %>%
    filter(Admix_coef > chosen_threshold2) %>%
    mutate(Threshold = paste0("> ", chosen_threshold2)))


# Number of pure isolates per K
pure_by_threshold %>%
    dplyr::group_by(K, Threshold) %>%
    dplyr::count() %>%
    ggplot(aes(x = as.factor(K), y = n, fill = Threshold)) +
       geom_bar(stat = "identity", position = "dodge") +
       theme_bw() + scale_fill_manual(values = c("#16697a", "#82c0cc"))+ 
    labs(x = "K", y = "Number of isolates", title = "Pure isolates per K") 

# Number of pure isolates per K per country
temp2 = pure_by_threshold %>%
  dplyr::group_by(Continent, for_display, Cluster, K, Threshold) %>%
  dplyr::count() 

temp2 %>%
  filter(K > 9 & K < 16) %>%
  ggplot(aes(x = Cluster, y = for_display,
             size = n, color = Continent)) +
  geom_point(alpha = 0.3) + Color_Continent+ theme_bw() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  facet_grid(rows = vars(Threshold), cols = vars(K), scales = "free_x") + 
    labs(x = "", y = "", size = "Nb of isolates",
         title = "Pure isolates per country and per K") 

Choosing the number K of clusters that best represent the data at hand is always a challenge. First, let’s look at the cross-validation results. sNMF estimates an entropy criterion which evaluates the quality of fit of the model to the data, potentially helping to find the best number of ancestral populations.

#Dot plot of the cross-entropy 
#plot(project, col = "goldenrod", pch = 19, cex = 1.2) #native method
cross_ent = list()
for (i in K_list){
  cross_ent[[i]] = as_tibble(cross.entropy(project, K = i), rownames = "NA") %>% 
    mutate( K = i)
  colnames(cross_ent[[i]] ) <- c("Run", "Crossentropy", "K")
}
cross_ent = bind_rows(cross_ent)
temp = cross_ent %>% group_by(K) %>% dplyr::summarize(average = mean(Crossentropy), minimum = min(Crossentropy))

p1 = ggplot() +
  geom_point(data = cross_ent, aes(x = as.factor(K), y = Crossentropy), alpha = 0.4, size = 2, col = "#82c0cc") +
  geom_point(data = temp, aes(x = as.factor(K), y = minimum), alpha = 0.4, size = 2, col = "#16697a") +
  theme_cowplot() +
    labs(x = "K", y = "Cross-entropy")

p2 = pure_by_threshold %>%
  filter(Threshold == paste0("> ", chosen_threshold)) %>%
  filter(K > 1) %>%
  dplyr::group_by(Cluster, K) %>%
  dplyr::count()%>%
  group_by(K) %>%
  dplyr::summarize(Size_smallest_cluster = min(n)) %>%
  ggplot(aes(x = as.factor(K), label = Size_smallest_cluster)) +
    geom_bar(aes(y = Size_smallest_cluster, fill = K), stat = "identity") + 
    theme_cowplot() +
    geom_label(aes(y = Size_smallest_cluster + 7)) +
    scale_fill_continuous(high = "#16697a", low = "#82c0cc") +
    labs(x = "K", y = "Size of the smallest cluster")

p3 = pure_by_threshold %>%
  filter(Threshold == paste0("> ", chosen_threshold)) %>%
    dplyr::group_by(K) %>%
    dplyr::count() %>%
    ggplot(aes(x = as.factor(K), y = n, fill = K)) +
       geom_bar(stat = "identity", position = "dodge") +
       theme_bw() +
       scale_fill_continuous(high = "#16697a", low = "#82c0cc") +
    labs(x = "K", y = "Number of assigned samples")

top_row = cowplot::plot_grid(p1, p3 + theme(legend.position = "none"), ncol = 2)
cowplot::plot_grid(top_row, p2 + theme(legend.position = "none"), ncol = 1)

In the case of our analysis, we do not have a very clear-cut minimum value for the cross-entropy criterion value. There is however a plateau starting from K=10. I also look at the number of samples assigned to one cluster or another based on the threshold set previously, as well as at the smallest cluster size. Even though, we might get hints that a very real cluster exists in one specific population, if the cluster size is very small, I will not be able to estimate anything with them so there is no point in going to the level of mini-clusters!

#These values are chosen based on the plot above
chosen_K = 11

write_tsv(snmf_results_per_K %>% filter(K == chosen_K), 
          file = paste0(PopStr_dir, vcf_name, ".snmf_results_chosen_K.tab"))

#Table
kable(snmf_results_per_K %>%
  filter(K == chosen_K) %>%
  filter(Admix_coef > chosen_threshold) %>%
  dplyr::group_by(Continent, Cluster) %>%
  dplyr::count() %>%
  pivot_wider(names_from = Continent, values_from =n, values_fill = 0))
Cluster Africa Europe Middle East North America Oceania South America NA
V11 21 8 2 0 0 0 0
V2 4 553 0 0 0 0 2
V6 0 0 40 0 0 0 0
V7 0 0 16 0 0 0 0
V10 0 0 0 178 0 0 1
V4 0 0 0 1 36 0 0
V8 0 0 0 20 0 0 0
V1 0 0 0 0 43 0 0
V3 0 0 0 0 31 0 0
V5 0 0 0 0 0 31 0
V9 0 0 0 0 0 16 0
#Looking at individuals with admixture coef higher than the threshold defined above.
high_anc_coef_snmf = snmf_results_per_K %>%
  filter(K == chosen_K) %>%
  filter(Admix_coef > chosen_threshold)

##Bubble plot
p_cluster = high_anc_coef_snmf %>%
  dplyr::group_by(Continent, for_display, Cluster) %>%
  dplyr::count() %>%
  ggplot(aes(x = for_display, y = Cluster,
             size = n, color = Continent)) +
  geom_point(alpha = 0.5) + Color_Continent+ theme_bw() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
    labs(x = "", 
         title = "Pure isolates per country for the chosen K value",
         subtitle = str_wrap(paste0("Number of genotypes with admix coef > ", chosen_threshold, 
                                    " assigned to ", chosen_K, " clusters."),
                             width = 70),
         size = "Nb of isolates") 
p_cluster

##Writing out tables for later
high_anc_coef_snmf %>% dplyr::select(Sample) %>%
  write_tsv(., file = paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.ind"),
            col_names = F)
high_anc_coef_snmf %>%
  write_tsv(., file = paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.tsv"),
            col_names = T)
max_continent = dplyr::count(high_anc_coef_snmf, Cluster, Continent) %>%
  group_by(Cluster) %>%
  mutate(somme = sum(n), prop = n/somme) %>%
  filter(prop > 0.5) %>%
  dplyr::select(Cluster, Continent)

clusters = dplyr::count(high_anc_coef_snmf, Cluster, Continent)

high_anc_coef_snmf %>% 
  dplyr::count(Cluster, Continent) %>%
  group_by(Cluster) %>%
  dplyr::mutate(Nb_per_cluster = sum(n)) %>%
  filter(!is.na(Continent)) %>%
  ggplot(aes(x = reorder(Cluster, Nb_per_cluster), y = n, fill = Continent)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    labs(x = element_blank(), y = "Number of samples") +
    theme(axis.text.x = element_text(angle = 40, hjust = 1, vjust = 1)) +
    Fill_Continent

#Files with all pure isolates per cluster
for (cluster in max_continent %>% pull(Cluster)) {
  file_name = paste0(PopStr_dir, "Sample_list_", cluster, ".args")
  high_anc_coef_snmf %>% filter(Cluster == cluster) %>%
    dplyr::select(Sample) %>%
    write_tsv(file = file_name, col_names = F)
}


#Files with a similar number pure isolates between cluster

minimum_cluster_size = min(dplyr::count(high_anc_coef_snmf, Cluster) %>% dplyr::select(n))

for (cluster in max_continent %>% pull(Cluster)) {
  for (i in 1:10 ){
    file_name = paste0(PopStr_dir, "Sample_list_", cluster, "_", i, ".args")
    temp = high_anc_coef_snmf %>% 
      filter(Cluster == cluster) %>%
      sample_n(size = minimum_cluster_size) %>%
      dplyr::select(Sample)
    write_tsv(temp, file = file_name, col_names = F) 
  }
}

The clustering data can also be represented as an average of ancestry coefficient per country. This is done here first with barplots.

chosen_coef = snmf_results_per_K %>% 
  filter(K == chosen_K) %>%
  filter(Country != "NA") %>%
  filter(Country != "USA_NA") %>%
  dplyr::select(Sample, Continent, Country, Admix_coef, Cluster, Latitude, Longitude) 

#Identifying the main cluster per country
cluster_per_country = high_anc_coef_snmf %>%
  group_by(Country, Cluster) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::mutate(freq = n / sum(n)) %>%
  filter(freq > 0.5) %>%
  dplyr::select(Country, Main_country_cluster = Cluster)

# Cluster composition per country for all continents
temp = chosen_coef %>%
  group_by(Continent, Country, Cluster) %>%
  dplyr::summarize(average_coef = mean(Admix_coef),
                   Nb_per_country = n()) %>%
  dplyr::mutate(Cluster2 = ifelse(average_coef < 0.1, "Other", Cluster))  %>%
  filter(Nb_per_country >= 6) 

#Bar plot per country
temp %>% filter(Continent != "NA") %>%
  filter(Continent != "Asia") %>%
  ggplot(aes(x = Country, y = average_coef, fill = Cluster2))  + 
    geom_bar(stat = "identity") +
    scale_fill_manual(values = c("grey", K_colors) ) + 
    theme_cowplot() +
     labs(subtitle = "All continents, limited to countries with 6 or more isolates",
         title = "Cluster ancestry per country",
         y = "Average of the ancestry coefficient", fill = "Cluster") +
    facet_grid(cols = vars(Continent), switch = "x", scales = "free_x", space = "free_x") +
    theme(axis.title.x = element_blank(), 
          axis.title.y = element_text(size = 12),
          axis.text.x = element_text(angle = 50, hjust = 1, vjust = 1, size = 8),
          axis.text.y = element_text(hjust = 1, vjust = 1, size = 10),
          panel.spacing = unit(0, "lines"), 
          strip.background = element_blank(),
          strip.placement = "bottom",
          strip.text = element_text(size = 10),
          legend.position = "bottom")

Such visualization is useful but not as intuitive as a map! Let’s use the same summarising method but with a more local viewpoint (I use rounded coordinates instead of country).

#Summarizing based on samples found in neighbouring areas
temp = chosen_coef  %>%
  mutate(Latitude = round(Latitude/2)*2, Longitude = round(Longitude/2)*2)%>%
  group_by(Continent, Country, Cluster, Longitude, Latitude) %>%
  summarize(average_coef = mean(Admix_coef), number_of_isolates = n()) %>%
  filter(!is.na(Latitude))

write_tsv(temp, file = paste0(PopStr_dir, vcf_name, ".chosen_coef_pies.tab"))

#Transforming to fit the scatter pie requirements
pies  = temp %>% #filter(number_of_isolates > 2) %>%
  mutate(Cluster2 = ifelse(average_coef < 0.1, "Other", Cluster)) %>%
  group_by(Continent, Longitude, Latitude, Country, Cluster2, number_of_isolates) %>%
  dplyr::summarize(to_plot = sum(average_coef)) %>%
  arrange(Cluster2) %>%
  pivot_wider(names_from = Cluster2, values_from = to_plot, values_fill = 0) %>%
  mutate(radius = ifelse(number_of_isolates > 10, 1.5, log(number_of_isolates))) %>%
  dplyr::select(-number_of_isolates)  




# Simple map of the world
K_colors = c("#0D6E82", #V1 Aus (TAS)
             "#49810E", #V10 USA
             "#E16684", #V11 North Africa
             "#FFBA0A", #V2 Europe
             "#C7F1F9", #V3 NZ
             "#21C7E8", #V4 Australia (NSW)
             "#8FA253", #V5 Uruguay + Argentina
             "#650104", #V6 Israel + Turkey
             "#DE020A", #V7 Iran
             "#2A4908", #V8 Canada
             "#B3C186") #V9 Boliva + Chile + Ecuador


map1 = ggplot() + 
  theme_void() +
  geom_polygon(data = map_data("world"), aes(x=long, y = lat, group = group), fill="#ede7e3")  +
  scale_size("Number of genomes", limits = c(1, max_circle))  +
  theme(legend.position = "None", 
        panel.border  = element_rect(fill=NA, colour = "grey",size = 0.5, linetype = 1)) +
    scale_fill_manual(values = c(c("grey"), K_colors))

#Splitting the map into our 3 main focus areas
p1 = map1 + coord_cartesian(xlim=c(-20, 60), ylim=c(20, 70)) + 
  geom_scatterpie(data = pies, mapping = aes(x=Longitude, y=Latitude, group=Country, r = radius), 
                  cols = c("Other", "V1", "V10", "V11", "V2", "V3", "V4", 
                           "V5", "V6", "V7", "V8", "V9"), color=NA, alpha=.8)
p2 = map1 + coord_cartesian(xlim=c(-160, -40), ylim=c(-80, 80)) + 
  geom_scatterpie(data = pies, mapping = aes(x=Longitude, y=Latitude, group=Country, r = radius*2), 
                   cols = c("Other", "V1", "V10", "V11", "V2", "V3", "V4", 
                           "V5", "V6", "V7", "V8", "V9"), color=NA, alpha=.8)
p3 = map1 + coord_cartesian(xlim=c(115, 185), ylim=c(-65, 10)) + 
  geom_scatterpie(data = pies, mapping = aes(x=Longitude, y=Latitude, group=Country, r = radius*2), 
                   cols = c("Other", "V1", "V10", "V11", "V2", "V3", "V4", 
                           "V5", "V6", "V7", "V8", "V9"), color=NA, alpha=.8)

#Plotting all the maps together! 
aus_map = cowplot::plot_grid(p3, get_legend(p1), ncol = 2, rel_widths = c(1, 0.75))
ligne = cowplot::plot_grid(p1, aus_map, ncol = 1,  rel_heights = c(1, 0.7))
cowplot::plot_grid(p2, ligne, ncol = 2, rel_widths = c(0.75, 1)) 

#ggsave(paste0(fig_dir, "Str_scatter_pie.pdf"), width = 18, height = 10, units = "cm")

For each country, we define the local cluster with “votes”: the cluster to which more than half of the non-admixed isolates are originating is considered to be the local cluster. We can quantify for each country the isolates which are non-admixed (or “pure”) isolates from the local cluster, non-admixed from another cluster, or hybrid.

#For each sample, identify if is hybrid, local pure or pure from somewhere else.
status_admix = inner_join(chosen_coef, cluster_per_country) %>% 
  group_by(Sample, Continent, Country) %>%
  dplyr::mutate(max_admix = max(Admix_coef),
         max_cluster = ifelse(Admix_coef == max_admix, Cluster, "")) %>%
  filter(max_cluster != "") %>%
  dplyr::mutate(Status = ifelse(max_admix < chosen_threshold, "Hybrid",
                         ifelse(max_cluster == Main_country_cluster, "Pure local", 
                                "Pure other")))

# Summary of hybrid category per country
temp = status_admix %>% 
  group_by(Continent, Country, Main_country_cluster) %>% 
  dplyr::count(Status)

## As table
kable(pivot_wider(temp, names_from = Status, values_from = n, values_fill = 0))
Continent Country Main_country_cluster Hybrid Pure local Pure other
Africa Algeria V11 3 5 0
Africa Kenya V2 4 2 0
Africa Morocco V11 0 2 0
Africa Tunisia V11 1 14 2
Europe Belgium V2 0 14 0
Europe Czech Republic V2 2 18 0
Europe Denmark V2 0 9 0
Europe France V2 6 228 4
Europe Germany V2 1 63 0
Europe Ireland V2 0 36 0
Europe Italy V11 1 2 0
Europe Latvia V2 0 2 0
Europe Netherlands V2 2 6 0
Europe Poland V2 0 5 0
Europe Spain V11 4 2 0
Europe Sweden V2 0 6 0
Europe Switzerland V2 2 135 0
Europe UK V2 0 31 0
Middle East Iran V7 6 16 0
Middle East Israel V6 0 39 0
Middle East Syria V11 6 1 0
Middle East Turkey V11 10 1 0
Middle East Yemen V6 0 1 0
North America Canada V8 0 11 0
North America USA_California V10 0 7 0
North America USA_Indiana V10 4 7 0
North America USA_Minnesota V8 0 2 0
North America USA_Missouri V10 3 7 0
North America USA_North Dakota V8 0 7 0
North America USA_Ohio V10 0 3 0
North America USA_Oregon V10 0 145 0
North America USA_Texas V10 1 7 0
Oceania Australia_NA V4 1 9 3
Oceania Australia_NSW V4 0 27 0
Oceania Australia_Tasmania V1 0 40 0
Oceania New Zealand V3 24 31 0
South America Argentina V5 0 16 0
South America Bolivia V9 0 1 0
South America Chile V9 0 14 0
South America Ecuador V9 1 1 0
South America Uruguay V5 1 15 0
## As a figure
ungroup(temp) %>%
  dplyr::mutate(Nb_per_country = sum(n)) %>%
  filter(Nb_per_country >= 10) %>%
  ggplot(aes(x = Country, y = n, fill = Status)) +
    geom_bar(stat = "identity", position = "fill") + 
    theme_cowplot() +
    scale_fill_manual(values =c("#16697a", "#EDE7E3", "#82c0cc")) +
    labs(y = "Proportion of isolates")+
    facet_grid(rows = vars(Continent), switch = "y", scales = "free_y", space = "free_y") +
    theme(axis.title.y = element_blank(), 
          axis.title.x = element_text(size = 14),
          axis.text.x = element_text(size =10),
          axis.text.y = element_text(hjust = 1, vjust = 1, size = 8),
          panel.spacing = unit(0, "lines"), 
          strip.background = element_blank(),
          strip.placement = "bottom",
          strip.text = element_text(size = 10),
          legend.position = "top") +
  coord_flip()

    #facet_wrap(vars(Continent), scales = "free")
#Table hybrids assigned to several clusters, and pure foreign
temp = inner_join(chosen_coef, 
                  status_admix %>% 
                    filter(Status != "Pure local") %>% 
                    dplyr::select(Sample, Status)) %>%
  dplyr::select(Sample, Continent, Country, Latitude, Longitude, Status, Cluster, Admix_coef) %>%
  mutate(Admix_coef = ifelse(Admix_coef > 0.2, round(Admix_coef, 4), NA)) %>%
  pivot_wider(names_from = Cluster, values_from = Admix_coef) %>%
  rowwise() %>%
  dplyr::mutate(Count_NA = sum(is.na(c_across(V1:V11)))) %>%
  filter(Status == "Pure other" | Count_NA < 10)

opts <- options(knitr.kable.NA = "")
kable(temp, digits = 3, )
Sample Continent Country Latitude Longitude Status V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 Count_NA
Indiana_1993_I25a_1 North America USA_Indiana 40.420 -86.920 Hybrid 0.224 0.706 9
Missouri_1994_ST94MO15a_1 North America USA_Missouri 38.890 -92.210 Hybrid 0.221 0.698 9
ST08TN_26_5_4 Africa Tunisia 36.810 10.179 Pure other 0.805 10
ST13FR_Biog_SeptoDur10 Europe France 44.218 0.448 Hybrid 0.368 0.608 9
ST13FR_Biog_SeptoDur12 Europe France 44.218 0.448 Hybrid 0.449 0.548 9
ST13FR_Biog_SeptoDur29 Europe France 44.308 4.346 Pure other 0.765 10
ST13FR_Biog_SeptoDur8 Europe France 44.218 0.448 Hybrid 0.389 0.540 9
ST13FR_Biog_SeptoDur82 Europe France 44.308 4.346 Pure other 0.834 10
ST13FR_Biog_SeptoDur88 Europe France 44.308 4.346 Pure other 0.880 10
ST13IT_Biog_1819 Europe Italy 41.891 12.492 Hybrid 0.542 0.245 9
ST13NZ_St13_5_4 Oceania New Zealand -41.211 174.777 Hybrid 0.246 0.488 9
ST15NZ_St_15640_1 Oceania New Zealand -40.134 175.658 Hybrid 0.270 0.562 9
ST15NZ_St_15640_2 Oceania New Zealand -40.134 175.658 Hybrid 0.263 0.583 9
ST15NZ_St_15640_7 Oceania New Zealand -40.134 175.658 Hybrid 0.300 0.525 9
ST15NZ_St_15640_8 Oceania New Zealand -40.134 175.658 Hybrid 0.257 0.570 9
ST15NZ_St_15642_11 Oceania New Zealand -46.219 169.492 Hybrid 0.562 0.204 9
ST15NZ_St_15642_12 Oceania New Zealand -46.219 169.492 Hybrid 0.213 0.580 9
ST15NZ_St_15642_13 Oceania New Zealand -46.219 169.492 Hybrid 0.220 0.593 9
ST_SRR2834990 Oceania Australia_NA -35.296 149.122 Pure other 0.967 10
ST_SRR2835000 Oceania Australia_NA -35.296 149.122 Pure other 0.978 10
ST_SRR2835057 Oceania Australia_NA -35.296 149.122 Pure other 0.915 10
STnnJGI_SRR3452689 Middle East Turkey 39.931 32.833 Hybrid 0.252 0.622 9
STnnJGI_SRR3452700 Africa Algeria 36.818 3.048 Hybrid 0.342 0.304 9
STnnJGI_SRR3452702 Middle East Syria 33.517 36.316 Hybrid 0.259 0.564 9
STnnJGI_SRR3452704 Africa Algeria 36.818 3.048 Hybrid 0.338 0.302 9
STnnJGI_SRR3452713 Europe France 48.853 2.347 Pure other 0.885 10
STnnJGI_SRR3452770 Middle East Syria 33.517 36.316 Hybrid 0.319 0.635 9
STnnJGI_SRR3452776 Africa Tunisia 36.810 10.179 Pure other 0.844 10
STnnJGI_SRR4235083 Africa Algeria 36.818 3.048 Hybrid 0.352 0.236 9
STnnJGI_SRR4235084 Oceania New Zealand -41.211 174.777 Hybrid 0.405 0.292 0.249 8
STnnJGI_SRR4235085 Africa Tunisia 36.810 10.179 Hybrid 0.401 0.236 9
STnnJGI_SRR4235089 Oceania New Zealand -41.211 174.777 Hybrid 0.406 0.234 0.265 8
STnnJGI_SRR5194479 Europe Spain 40.416 -3.708 Hybrid 0.322 0.218 0.216 8
STnnJGI_SRR5194515 Europe Spain 40.416 -3.708 Hybrid 0.265 0.201 0.246 8
STnnJGI_SRR5194596 Middle East Iran 35.727 51.385 Hybrid 0.222 0.384 9
STnnJGI_SRR5194607 Middle East Syria 33.500 35.800 Hybrid 0.277 0.619 9
STnnJGI_SRR5829674 Oceania New Zealand -41.211 174.777 Hybrid 0.440 0.249 0.264 8
Syria_1992_SYK2 Middle East Syria 33.517 36.316 Hybrid 0.285 0.583 9
Syria_1992_SYT3 Middle East Syria 36.010 36.940 Hybrid 0.239 0.596 9
write_tsv(x = temp, file = paste0(PopStr_dir, "Hybrids_and_pure_foreign.tab"))



# Simple map of the world
map1 = ggplot() + 
  theme_void() +
  geom_polygon(data = map_data("world"), aes(x=long, y = lat, group = group), fill="#ede7e3")  +
  theme(panel.border  = element_rect(fill=NA, colour = "grey",size = 0.5, linetype = 1))

#Splitting the map into our 3 main focus areas
p1 = map1 + coord_cartesian(xlim=c(-20, 60), ylim=c(20, 70)) + 
  geom_jitter(data = temp, aes(Longitude, Latitude, col = Status), width = 1, height = 1) 
p2 = map1 + coord_cartesian(xlim=c(-160, -40), ylim=c(-80, 80))+ 
  geom_jitter(data = temp, aes(Longitude, Latitude, col = Status), width = 1, height = 1) 
p3 = map1 + coord_cartesian(xlim=c(115, 175), ylim=c(-65, 10)) + 
  geom_jitter(data = temp, aes(Longitude, Latitude, col = Status), width = 2, height = 2) 

#Plotting all the maps together! 
aus_map = cowplot::plot_grid(p3 + theme(legend.position = "none"), get_legend(p1), ncol = 2, rel_widths = c(1, 0.9))
ligne = cowplot::plot_grid(p1 + theme(legend.position = "none"), aus_map, ncol = 1,  rel_heights = c(1, 0.7))
cowplot::plot_grid(p2 + theme(legend.position = "none"), ligne, ncol = 2, rel_widths = c(0.7, 1)) 

#ggsave(paste0(fig_dir, "Str_scatter_pie.pdf"), width = 18, height = 10, units = "cm")

Principal Component Analysis

As a first method to investigate the population structure of Z.tritici at the world-wide scale, I chose to do a principal component analysis based on a subset of the SNPs. This PCA confirms previous results of a geographically structured species. Oceania emerges quite distincly as three separate clusters: one in New-Zealand and two Australian (see below for a more in-depth analysis of this pattern). North-America is also quite serapate. The distinction between the rest of the geographical location is not clear in this PCA, although PC3 might show a slight differenciation between Europe and the Middle-East.

snpgdsVCF2GDS(paste0(vcf_dir, vcf_name, ".recode.vcf"),
              paste0(PopStr_dir, vcf_name, ".recode.gds"), method="biallelic.only")
genofile <- snpgdsOpen(paste0(PopStr_dir, vcf_name, ".recode.gds"))
pca <-snpgdsPCA(genofile)
snpgdsClose(genofile)

pca2 = as_tibble(pca$eigenvect) %>% dplyr::select(V1:V8)
colnames(pca2) = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8")
pca2 = pca2 %>%
  dplyr::mutate(sample_id = pca$sample.id ) %>%
  dplyr::right_join(., status_admix, by = c("sample_id" = "Sample")) %>%
  filter(!is.na(PC1))

#Writing table to store PCA results
pca2 %>%
dplyr::select(ID_file = sample_id, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8, 
              Continent, Country, Latitude, Longitude) %>%
write_tsv(., file = paste0(PopStr_dir, vcf_name, ".PCA_results.tab"),
            col_names = T)

#
as.tibble(pca$eigenval[!is.na(pca$eigenval)]) %>%
  ggplot(aes(x = c(1:length(pca$eigenval[!is.na(pca$eigenval)])),
             y =value)) + 
  geom_point() +
  theme_bw()

eigen_sum = sum(pca$eigenval[!is.na(pca$eigenval)])
for_plot = pca2 %>% mutate(Cluster = ifelse(Status == "Hybrid", Status, Cluster))

#Defining colors per cluster based on cluster
myColors2 <- c("grey", "#129eba", "#3F6E0C", "#DA4167", "#ffba0a", "#129eba", "#129eba", 
               "#8fa253", "#A20106", "#A20106", "#3F6E0C", "#8fa253")
myShapes <- c(1, 1, 1, 1, 1, 2, 0, 
               1, 1, 0, 0, 0)

names(myColors2) = levels(factor(for_plot$Cluster))
names(myShapes) = levels(factor(for_plot$Cluster))
Color_Cluster = ggplot2::scale_colour_manual(name = "Cluster", values = myColors2)
Fill_Cluster = ggplot2::scale_fill_manual(name = "Cluster", values = myColors2)
Shape_Cluster = ggplot2::scale_shape_manual(name = "Cluster", values = myShapes)

# Dot plots
p1 = ggplot(for_plot, aes(x = PC1, y= PC2, shape = Cluster)) +
  geom_point(aes(color = Cluster)) +
  labs(x = paste0("PC 1 (", round(pca$eigenval[1]*100/eigen_sum, 2), "%)"),
       y = paste0("PC 2 (", round(pca$eigenval[2]*100/eigen_sum, 2), "%)")) +
  Color_Cluster + Shape_Cluster + 
  theme_bw()

p2 = ggplot(for_plot, aes(x = PC3, y= PC4, shape = Cluster)) +
  geom_point(aes(color = Cluster)) +
  labs(x = paste0("PC 3 (", round(pca$eigenval[3]*100/eigen_sum, 2), "%)"),
       y = paste0("PC 4 (", round(pca$eigenval[4]*100/eigen_sum, 2), "%)")) +
  Color_Cluster + Shape_Cluster +
  theme_bw()

p3 = ggplot(for_plot, aes(x = PC5, y= PC6, shape = Cluster)) +
  geom_point(aes(color = Cluster)) +
  labs(x = paste0("PC 5 (", round(pca$eigenval[5]*100/eigen_sum, 2), "%)"),
       y = paste0("PC 6 (", round(pca$eigenval[6]*100/eigen_sum, 2), "%)")) +
  Color_Cluster + Shape_Cluster +
  theme_bw()

p4 = ggplot(for_plot, aes(x = PC7, y= PC8, shape = Cluster)) +
  geom_point(aes(color = Cluster)) +
  labs(x = paste0("PC 7 (", round(pca$eigenval[7]*100/eigen_sum, 2), "%)"),
       y = paste0("PC 8 (", round(pca$eigenval[8]*100/eigen_sum, 2), "%)")) +
  Color_Cluster + Shape_Cluster +
  theme_bw()

PCA_grid  = cowplot::plot_grid(p1 + theme(legend.position = "none"), p2 + theme(legend.position = "none"),
                               p3 + theme(legend.position = "none"), p4 + theme(legend.position = "none"))
cowplot::plot_grid(PCA_grid, get_legend(p1 + theme(legend.position = "bottom")), ncol =1, rel_heights = c(1, 0.2))

p11 = filter(for_plot, Cluster != "Hybrid") %>%
  ggplot(aes(PC1, color = Cluster, fill = Cluster)) + 
  geom_density(alpha = .3)+
  Color_Cluster + Fill_Cluster +
  theme_cowplot() + 
  theme(axis.title = element_blank(), axis.text = element_blank(),
        plot.margin = unit(c(0,0,0,0), "cm"), axis.ticks = element_blank())

p12 = filter(for_plot, Cluster != "Hybrid") %>%
  ggplot(aes(PC2, color = Cluster, fill = Cluster)) + 
  geom_density(alpha = .3)+
  Color_Cluster + Fill_Cluster +
  theme_cowplot() + 
  theme(axis.title = element_blank(), axis.text = element_blank(),
        plot.margin = unit(c(0,0,0,0), "cm"), axis.ticks = element_blank())

p13 = filter(for_plot, Cluster != "Hybrid") %>%
  ggplot(aes(PC3, color = Cluster, fill = Cluster)) + 
  geom_density(alpha = .3)+
  Color_Cluster + Fill_Cluster +
  theme_cowplot() + 
  theme(axis.title = element_blank(), axis.text = element_blank(),
        plot.margin = unit(c(0,0,0,0), "cm"), axis.ticks = element_blank())

cowplot::plot_grid(p11+ theme(legend.position = "none"), 
                   p12 + theme(legend.position = "none"), 
                   p13 + theme(legend.position = "none"),
                   ncol = 1, align = "hv") 


ggMarginal(ggplot(for_plot, aes(x = PC2, y= PC3, shape = Cluster)) +
  geom_point(aes(color = Cluster)) +
  labs(x = paste0("PC 2 (", round(pca$eigenval[2]*100/eigen_sum, 2), "%)"),
       y = paste0("PC 3 (", round(pca$eigenval[3]*100/eigen_sum, 2), "%)")) +
  Color_Cluster + Shape_Cluster +
  theme_bw() + theme(legend.position = "bottom"), groupColour = TRUE, groupFill = TRUE)

ggplot2::theme_set(theme_light())
p = ggpairs(pca2, columns = c(1:6),
            ggplot2::aes(col=Continent, fill = Continent, alpha = 0.6),
            title = "PCA based thinned SNPs",
            upper = list(continuous = "points", combo = "box_no_facet"))

for(i in 1:p$nrow) {
  for(j in 1:p$ncol){
    p[i,j] <- p[i,j] + theme_light() + Color_Continent + Fill_Continent
  }
}

p

Population trees

In the steps before, I have learned about population history indirectly by inferring genetic populations from the genomic data. The relationship between the population and the underlying demography is not explicit in these however. It is possible however to infer splits between populations and create a population tree. Here, I use treemix, which takes into account the possibility of gene flow between populations and indeed test of it in the process of creating a population tree.

Because the populations in the clustering were not perfectly distinct from one another, I start with “discretized” populations by choosing only the isolates with high ancestry in one of the sNMF clusters.

~/Documents/Software/vcftools_jydu/src/cpp/vcftools \
  --vcf ${VCFDIR}$VCFNAME.recode.vcf \
  --keep ${POPSTR}$VCFNAME.high_anc_coef_snmf.ind \
  --remove-filtered-all --extract-FORMAT-info GT \
  --max-missing 1.0 --min-alleles 2 --max-alleles 2 \
  --maf 0.05 \
  --out ${POPSTR}$VCFNAME.high_anc_coef_snmf

cat  ${POPSTR}$VCFNAME.high_anc_coef_snmf.GT.FORMAT | cut -f 3- \
   >  ${POPSTR}$VCFNAME.high_anc_coef_snmf.GT.FORMAT2
#conda activate r-reticulate
from collections import defaultdict

#For each isolate, store its pop (as in sampling site) in a dictionary
dict_pop = dict(zip(r.high_anc_coef_snmf["Sample"],
    r.high_anc_coef_snmf["Cluster"]))

#Keep a list of the pop names/coordinates to write in the same order later
all_pops = sorted(list(set(r.high_anc_coef_snmf["Cluster"])))
#out_name = r.PopStr_dir + r.vcf_name + ".high_anc_coef_snmf.treemix"
out_name =  "/Users/afeurtey/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp.high_anc_coef_snmf.treemix"

out = open(out_name, "w")
shutup = out.write(" ".join(all_pops) + "\n")

#with open(r.PopStr_dir + r.vcf_name + ".high_anc_coef_snmf.GT.FORMAT2", "r") as input_snps :
with open("/Users/afeurtey/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp.high_anc_coef_snmf.GT.FORMAT2", "r") as input_snps :
  for i, snp in enumerate(input_snps) :
    
    #Setting two dictionaries with values at 0
    dict_snp0 = defaultdict(int)
    dict_snp1 = defaultdict(int)
    Lets_write = True
    
    #The first line is the name of the isolates
    if i == 0 :
      indv = snp.strip().split("\t")
      Lets_write = False
    else :
      #Keeping isolate name and allelic value together
      alleles = zip(indv, snp.strip().split("\t"))
            
      #...and counting the O and 1 based on the pop
      for ind, allele in alleles:
        if allele == "0" :
          dict_snp0[dict_pop[ind]] += 1
        elif allele == "1" :
          dict_snp1[dict_pop[ind]] += 1
        else :
          print("Only biallelic please!!!!")
          Lets_write = False
    #If I have not found anything weird, I will write the result to the output file.
    if Lets_write :
      shutup = out.write(" ".join([",".join([str(dict_snp0[pop]), str(dict_snp1[pop])])  for pop in all_pops]) + "\n")


print("All done!")
out.close()
POPSTR="/Users/afeurtey/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/"

if [ -f ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix ] ;
then
  gzip ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix
fi

treemix \
  -i ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix.gz \
  -o ${POPSTR}$VCFNAME.high_anc_coef_snmf.m0.treemix.out \
  -m 0 -root V7

treemix \
  -i ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix.gz \
  -o ${POPSTR}$VCFNAME.high_anc_coef_snmf.m1.treemix.out \
  -m 1 -root V7

treemix \
  -i ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix.gz \
  -o ${POPSTR}$VCFNAME.high_anc_coef_snmf.m2.treemix.out \
  -m 2 -root V7

treemix \
  -i ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix.gz \
  -o ${POPSTR}$VCFNAME.high_anc_coef_snmf.m3.treemix.out \
  -m 3 -root V7
  
treemix \
  -i ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix.gz \
  -o ${POPSTR}$VCFNAME.high_anc_coef_snmf.m4.treemix.out \
  -m 4 -root V7

treemix \
  -i ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix.gz \
  -o ${POPSTR}$VCFNAME.high_anc_coef_snmf.m5.treemix.out \
  -m 5 -root V7

treemix \
  -i ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix.gz \
  -o ${POPSTR}$VCFNAME.high_anc_coef_snmf.m6.treemix.out \
  -m 6 -root V7
source("~/Documents/Software/treemix-1.13/src/plotting_funcs.R")
p_cluster

t = plot_tree(paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.m0.treemix.out"))
##     V1   V2       V3      V4      V5  V6  V7 V8  V9 V10
## 1    0 <NA> NOT_ROOT NOT_MIG NOT_TIP  16  75  1 211   1
## 2    1   V5 NOT_ROOT NOT_MIG     TIP   2  NA NA  NA  NA
## 3    2 <NA> NOT_ROOT NOT_MIG NOT_TIP  76   3  1   1   1
## 4    3   V9 NOT_ROOT NOT_MIG     TIP   2  NA NA  NA  NA
## 5    4   V6 NOT_ROOT NOT_MIG     TIP  52  NA NA  NA  NA
## 6   15   V3 NOT_ROOT NOT_MIG     TIP  32  NA NA  NA  NA
## 7   16 <NA> NOT_ROOT NOT_MIG NOT_TIP  76 172  4   0   2
## 8   31   V1 NOT_ROOT NOT_MIG     TIP 104  NA NA  NA  NA
## 9   32 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 104  2  15   1
## 10  51   V7 NOT_ROOT NOT_MIG     TIP 213  NA NA  NA  NA
## 11  52 <NA> NOT_ROOT NOT_MIG NOT_TIP 213   4  1 136   9
## 12  75   V8 NOT_ROOT NOT_MIG     TIP   0  NA NA  NA  NA
## 13  76 <NA> NOT_ROOT NOT_MIG NOT_TIP 136   2  2  16   6
## 14 103   V4 NOT_ROOT NOT_MIG     TIP 104  NA NA  NA  NA
## 15 104 <NA> NOT_ROOT NOT_MIG NOT_TIP  32  31  1 103   1
## 16 135  V11 NOT_ROOT NOT_MIG     TIP 136  NA NA  NA  NA
## 17 136 <NA> NOT_ROOT NOT_MIG NOT_TIP  52 135  1  76   8
## 18 171   V2 NOT_ROOT NOT_MIG     TIP 172  NA NA  NA  NA
## 19 172 <NA> NOT_ROOT NOT_MIG NOT_TIP  16  32  3 171   1
## 20 211  V10 NOT_ROOT NOT_MIG     TIP   0  NA NA  NA  NA
## 21 213 <NA>     ROOT NOT_MIG NOT_TIP 213  51  1  52  10
##                                                                                                                                                                                                                                                                V11
## 1                                                                                                                                                                                                                           (V8:0.0462969,V10:0.0279962):0.0095659
## 2                                                                                                                                                                                                                                                     V5:0.0114564
## 3                                                                                                                                                                                                                            (V9:0.031812,V5:0.0114564):0.00720251
## 4                                                                                                                                                                                                                                                      V9:0.031812
## 5                                                                                                                                                                                                                                                    V6:0.00531978
## 6                                                                                                                                                                                                                                                      V3:0.031223
## 7                                                                                                                       ((((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.004535,(V8:0.0462969,V10:0.0279962):0.0095659):0.0037794
## 8                                                                                                                                                                                                                                                      V1:0.045482
## 9                                                                                                                                                                                                    ((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577
## 10                                                                                                                                                                                                                                                    V7:0.0137477
## 11                 (V6:0.00531978,(V11:0.0239367,((V9:0.031812,V5:0.0114564):0.00720251,((((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.004535,(V8:0.0462969,V10:0.0279962):0.0095659):0.0037794):0.018261):0.0118456):0.0137477
## 12                                                                                                                                                                                                                                                    V8:0.0462969
## 13                                                                     ((V9:0.031812,V5:0.0114564):0.00720251,((((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.004535,(V8:0.0462969,V10:0.0279962):0.0095659):0.0037794):0.018261
## 14                                                                                                                                                                                                                                                    V4:0.0536008
## 15                                                                                                                                                                                                                            (V1:0.045482,V4:0.0536008):0.0126766
## 16                                                                                                                                                                                                                                                   V11:0.0239367
## 17                                           (V11:0.0239367,((V9:0.031812,V5:0.0114564):0.00720251,((((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.004535,(V8:0.0462969,V10:0.0279962):0.0095659):0.0037794):0.018261):0.0118456
## 18                                                                                                                                                                                                                                                  V2:0.000271025
## 19                                                                                                                                                                         (((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.004535
## 20                                                                                                                                                                                                                                                   V10:0.0279962
## 21 (V7:0.0137477,(V6:0.00531978,(V11:0.0239367,((V9:0.031812,V5:0.0114564):0.00720251,((((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.004535,(V8:0.0462969,V10:0.0279962):0.0095659):0.0037794):0.018261):0.0118456):0.0137477);
##             x          y       ymin       ymax
## 1  0.05719960 0.09090909 0.00000000 0.18181818
## 2  0.06251321 0.59090909 0.54545455 0.63636364
## 3  0.05105681 0.63636364 0.54545455 0.72727273
## 4  0.08286881 0.68181818 0.63636364 0.72727273
## 5  0.01906748 0.86363636 0.81818182 0.90909091
## 6  0.09325747 0.31818182 0.27272727 0.36363636
## 7  0.04763370 0.18181818 0.00000000 0.54545455
## 8  0.12019307 0.50000000 0.45454545 0.54545455
## 9  0.06203447 0.36363636 0.27272727 0.54545455
## 10 0.01374770 0.95454545 0.90909091 1.00000000
## 11 0.01374770 0.81818182 0.00000000 0.90909091
## 12 0.10349650 0.13636364 0.09090909 0.18181818
## 13 0.04385430 0.54545455 0.00000000 0.72727273
## 14 0.12831187 0.40909091 0.36363636 0.45454545
## 15 0.07471107 0.45454545 0.36363636 0.54545455
## 16 0.04953000 0.77272727 0.72727273 0.81818182
## 17 0.02559330 0.72727273 0.00000000 0.81818182
## 18 0.05243972 0.22727273 0.18181818 0.27272727
## 19 0.05216870 0.27272727 0.18181818 0.54545455
## 20 0.08519580 0.04545455 0.00000000 0.09090909
## 21 0.00000000 0.90909091 0.00000000 1.00000000

##  [1] 0.06251321 0.08286881 0.01906748 0.09325747 0.12019307 0.01374770
##  [7] 0.10349650 0.12831187 0.04953000 0.05243972 0.08519580
## [1] 0.003
## [1] "mse 0.000426880520661157"
t = plot_tree(paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.m1.treemix.out"))
##     V1   V2       V3      V4      V5  V6  V7 V8  V9 V10
## 1    0 <NA> NOT_ROOT NOT_MIG NOT_TIP  16 211  1  75   1
## 2    1   V5 NOT_ROOT NOT_MIG     TIP   2  NA NA  NA  NA
## 3    2 <NA> NOT_ROOT NOT_MIG NOT_TIP  76   3  1   1   1
## 4    3   V9 NOT_ROOT NOT_MIG     TIP   2  NA NA  NA  NA
## 5    4   V6 NOT_ROOT NOT_MIG     TIP  52  NA NA  NA  NA
## 6   15   V3 NOT_ROOT NOT_MIG     TIP  32  NA NA  NA  NA
## 7   16 <NA> NOT_ROOT NOT_MIG NOT_TIP  76 172  4   0   2
## 8   31   V1 NOT_ROOT NOT_MIG     TIP 104  NA NA  NA  NA
## 9   32 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 104  2  15   1
## 10  51   V7 NOT_ROOT NOT_MIG     TIP 213  NA NA  NA  NA
## 11  52 <NA> NOT_ROOT NOT_MIG NOT_TIP 213   4  1 136   9
## 12  75   V8 NOT_ROOT NOT_MIG     TIP   0  NA NA  NA  NA
## 13  76 <NA> NOT_ROOT NOT_MIG NOT_TIP 136   2  2  16   6
## 14 103   V4 NOT_ROOT NOT_MIG     TIP 104  NA NA  NA  NA
## 15 104 <NA> NOT_ROOT NOT_MIG NOT_TIP  32  31  1 103   1
## 16 135  V11 NOT_ROOT NOT_MIG     TIP 136  NA NA  NA  NA
## 17 136 <NA> NOT_ROOT NOT_MIG NOT_TIP  52  76  8 135   1
## 18 171   V2 NOT_ROOT NOT_MIG     TIP 172  NA NA  NA  NA
## 19 172 <NA> NOT_ROOT NOT_MIG NOT_TIP  16  32  3 171   1
## 20 211  V10 NOT_ROOT NOT_MIG     TIP   0  NA NA  NA  NA
## 21 213 <NA>     ROOT NOT_MIG NOT_TIP 213  51  1  52  10
## 22 216 <NA> NOT_ROOT     MIG NOT_TIP  52 136 NA  NA  NA
##                                                                                                                                                                                                                                                                    V11
## 1                                                                                                                                                                                                                                (V10:0.0156642,V8:0.132024):0.0214575
## 2                                                                                                                                                                                                                                                         V5:0.0114564
## 3                                                                                                                                                                                                                                (V9:0.031812,V5:0.0114564):0.00582432
## 4                                                                                                                                                                                                                                                          V9:0.031812
## 5                                                                                                                                                                                                                                                        V6:0.00531978
## 6                                                                                                                                                                                                                                                          V3:0.031223
## 7                                                                                                                        ((((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.000670075,(V10:0.0156642,V8:0.132024):0.0214575):0.00740961
## 8                                                                                                                                                                                                                                                          V1:0.045482
## 9                                                                                                                                                                                                        ((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577
## 10                                                                                                                                                                                                                                                        V7:0.0137477
## 11                 (V6:0.00531978,(((V9:0.031812,V5:0.0114564):0.00582432,((((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.000670075,(V10:0.0156642,V8:0.132024):0.0214575):0.00740961):0.0196059,V11:0.0237574):0.0119322):0.0137477
## 12                                                                                                                                                                                                                                                         V8:0.132024
## 13                                                                     ((V9:0.031812,V5:0.0114564):0.00582432,((((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.000670075,(V10:0.0156642,V8:0.132024):0.0214575):0.00740961):0.0196059
## 14                                                                                                                                                                                                                                                        V4:0.0536008
## 15                                                                                                                                                                                                                                (V1:0.045482,V4:0.0536008):0.0126766
## 16                                                                                                                                                                                                                                                       V11:0.0237574
## 17                                           (((V9:0.031812,V5:0.0114564):0.00582432,((((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.000670075,(V10:0.0156642,V8:0.132024):0.0214575):0.00740961):0.0196059,V11:0.0237574):0.0119322
## 18                                                                                                                                                                                                                                                      V2:0.000271025
## 19                                                                                                                                                                          (((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.000670075
## 20                                                                                                                                                                                                                                                       V10:0.0156642
## 21 (V7:0.0137477,(V6:0.00531978,(((V9:0.031812,V5:0.0114564):0.00582432,((((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.000670075,(V10:0.0156642,V8:0.132024):0.0214575):0.00740961):0.0196059,V11:0.0237574):0.0119322):0.0137477);
## 22                                                                                                                                                                                                                                                                <NA>
##             x          y       ymin       ymax
## 1  0.07415296 0.18181818 0.09090909 0.27272727
## 2  0.06256657 0.68181818 0.63636364 0.72727273
## 3  0.05111017 0.72727273 0.63636364 0.81818182
## 4  0.08292217 0.77272727 0.72727273 0.81818182
## 5  0.01906748 0.86363636 0.81818182 0.90909091
## 6  0.09445431 0.40909091 0.36363636 0.45454545
## 7  0.05269546 0.27272727 0.09090909 0.63636364
## 8  0.12138990 0.59090909 0.54545455 0.63636364
## 9  0.06323131 0.45454545 0.36363636 0.63636364
## 10 0.01374770 0.95454545 0.90909091 1.00000000
## 11 0.01374770 0.81818182 0.00000000 0.90909091
## 12 0.12549735 0.13636364 0.09090909 0.18181818
## 13 0.04528585 0.63636364 0.09090909 0.81818182
## 14 0.12950871 0.50000000 0.45454545 0.54545455
## 15 0.07590790 0.54545455 0.45454545 0.63636364
## 16 0.04943735 0.04545455 0.00000000 0.09090909
## 17 0.02567995 0.09090909 0.00000000 0.81818182
## 18 0.05363656 0.31818182 0.27272727 0.36363636
## 19 0.05336553 0.36363636 0.27272727 0.63636364
## 20 0.08981716 0.22727273 0.18181818 0.27272727
## 21 0.00000000 0.90909091 0.00000000 1.00000000
## 22 0.02273023         NA 0.00000000 0.81818182
## [1] "0.752794 0.0137477 0.02567995"

##  [1] 0.06256657 0.08292217 0.01906748 0.09445431 0.12138990 0.01374770
##  [7] 0.12549735 0.12950871 0.04943735 0.05363656 0.08981716
## [1] 0.003
## [1] "mse 0.000426880520661157"
t = plot_tree(paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.m2.treemix.out"))
##     V1   V2       V3      V4      V5  V6  V7 V8  V9 V10
## 1    0 <NA> NOT_ROOT NOT_MIG NOT_TIP  16 211  1  75   1
## 2    1   V5 NOT_ROOT NOT_MIG     TIP   2  NA NA  NA  NA
## 3    2 <NA> NOT_ROOT NOT_MIG NOT_TIP  76   1  1   3   1
## 4    3   V9 NOT_ROOT NOT_MIG     TIP   2  NA NA  NA  NA
## 5    4   V6 NOT_ROOT NOT_MIG     TIP  52  NA NA  NA  NA
## 6   15   V3 NOT_ROOT NOT_MIG     TIP  32  NA NA  NA  NA
## 7   16 <NA> NOT_ROOT NOT_MIG NOT_TIP  76   0  2 172   4
## 8   31   V1 NOT_ROOT NOT_MIG     TIP 104  NA NA  NA  NA
## 9   32 <NA> NOT_ROOT NOT_MIG NOT_TIP 172  15  1 104   2
## 10  51   V7 NOT_ROOT NOT_MIG     TIP 213  NA NA  NA  NA
## 11  52 <NA> NOT_ROOT NOT_MIG NOT_TIP 213   4  1 136   9
## 12  75   V8 NOT_ROOT NOT_MIG     TIP   0  NA NA  NA  NA
## 13  76 <NA> NOT_ROOT NOT_MIG NOT_TIP 136  16  6   2   2
## 14 103   V4 NOT_ROOT NOT_MIG     TIP 104  NA NA  NA  NA
## 15 104 <NA> NOT_ROOT NOT_MIG NOT_TIP  32  31  1 103   1
## 16 135  V11 NOT_ROOT NOT_MIG     TIP 136  NA NA  NA  NA
## 17 136 <NA> NOT_ROOT NOT_MIG NOT_TIP  52 135  1  76   8
## 18 171   V2 NOT_ROOT NOT_MIG     TIP 172  NA NA  NA  NA
## 19 172 <NA> NOT_ROOT NOT_MIG NOT_TIP  16 171  1  32   3
## 20 211  V10 NOT_ROOT NOT_MIG     TIP   0  NA NA  NA  NA
## 21 213 <NA>     ROOT NOT_MIG NOT_TIP 213  52 10  51   1
## 22 263 <NA> NOT_ROOT     MIG NOT_TIP 104 103 NA  NA  NA
## 23 216 <NA> NOT_ROOT     MIG NOT_TIP 136  76 NA  NA  NA
##                                                                                                                                                                                                                                                          V11
## 1                                                                                                                                                                                                                       (V10:0.0116286,V8:0.192552):0.024709
## 2                                                                                                                                                                                                                                              V5:0.00666147
## 3                                                                                                                                                                                                                      (V5:0.00666147,V9:0.052367):0.0102706
## 4                                                                                                                                                                                                                                                V9:0.052367
## 5                                                                                                                                                                                                                                              V6:0.00532122
## 6                                                                                                                                                                                                                                               V3:0.0306167
## 7                                                                                                                        ((V10:0.0116286,V8:0.192552):0.024709,(V2:0,(V3:0.0306167,(V1:0.0451691,V4:0.0541295):0.0130093):0.0105408):0.000836191):0.00956141
## 8                                                                                                                                                                                                                                               V1:0.0451691
## 9                                                                                                                                                                                             (V3:0.0306167,(V1:0.0451691,V4:0.0541295):0.0130093):0.0105408
## 10                                                                                                                                                                                                                                              V7:0.0137485
## 11                 (V6:0.00532122,(V11:0.0239381,(((V10:0.0116286,V8:0.192552):0.024709,(V2:0,(V3:0.0306167,(V1:0.0451691,V4:0.0541295):0.0130093):0.0105408):0.000836191):0.00956141,(V5:0.00666147,V9:0.052367):0.0102706):0.0171301):0.0118456):0.0137485
## 12                                                                                                                                                                                                                                               V8:0.192552
## 13                                                                     (((V10:0.0116286,V8:0.192552):0.024709,(V2:0,(V3:0.0306167,(V1:0.0451691,V4:0.0541295):0.0130093):0.0105408):0.000836191):0.00956141,(V5:0.00666147,V9:0.052367):0.0102706):0.0171301
## 14                                                                                                                                                                                                                                              V4:0.0541295
## 15                                                                                                                                                                                                                     (V1:0.0451691,V4:0.0541295):0.0130093
## 16                                                                                                                                                                                                                                             V11:0.0239381
## 17                                           (V11:0.0239381,(((V10:0.0116286,V8:0.192552):0.024709,(V2:0,(V3:0.0306167,(V1:0.0451691,V4:0.0541295):0.0130093):0.0105408):0.000836191):0.00956141,(V5:0.00666147,V9:0.052367):0.0102706):0.0171301):0.0118456
## 18                                                                                                                                                                                                                                                      V2:0
## 19                                                                                                                                                                         (V2:0,(V3:0.0306167,(V1:0.0451691,V4:0.0541295):0.0130093):0.0105408):0.000836191
## 20                                                                                                                                                                                                                                             V10:0.0116286
## 21 ((V6:0.00532122,(V11:0.0239381,(((V10:0.0116286,V8:0.192552):0.024709,(V2:0,(V3:0.0306167,(V1:0.0451691,V4:0.0541295):0.0130093):0.0105408):0.000836191):0.00956141,(V5:0.00666147,V9:0.052367):0.0102706):0.0171301):0.0118456):0.0137485,V7:0.0137485);
## 22                                                                                                                                                                                                                                                      <NA>
## 23                                                                                                                                                                                                                                                      <NA>
##             x          y       ymin       ymax
## 1  0.07699463 0.72727273 0.63636364 0.81818182
## 2  0.05965629 0.22727273 0.18181818 0.27272727
## 3  0.05299482 0.18181818 0.09090909 0.27272727
## 4  0.08726837 0.13636364 0.09090909 0.18181818
## 5  0.01906972 0.95454545 0.90909091 1.00000000
## 6  0.09427932 0.50000000 0.45454545 0.54545455
## 7  0.05228563 0.63636364 0.27272727 0.81818182
## 8  0.12184102 0.40909091 0.36363636 0.45454545
## 9  0.06366262 0.45454545 0.27272727 0.54545455
## 10 0.01374850 0.04545455 0.00000000 0.09090909
## 11 0.01374850 0.90909091 0.09090909 1.00000000
## 12 0.12880987 0.68181818 0.63636364 0.72727273
## 13 0.04272422 0.27272727 0.09090909 0.81818182
## 14 0.13080142 0.31818182 0.27272727 0.36363636
## 15 0.07667192 0.36363636 0.27272727 0.45454545
## 16 0.04953220 0.86363636 0.81818182 0.90909091
## 17 0.02559410 0.81818182 0.09090909 0.90909091
## 18 0.05312182 0.59090909 0.54545455 0.63636364
## 19 0.05312182 0.54545455 0.27272727 0.63636364
## 20 0.08862323 0.77272727 0.72727273 0.81818182
## 21 0.00000000 0.09090909 0.00000000 1.00000000
## 22 0.09649812         NA 0.27272727 0.36363636
## 23 0.03013852         NA 0.09090909 0.81818182
## [1] "0.366274 0.076671921 0.130801421"
## [1] "0.265289 0.0255941 0.04272422"

##  [1] 0.05965629 0.08726837 0.01906972 0.09427932 0.12184102 0.01374850
##  [7] 0.12880987 0.13080142 0.04953220 0.05312182 0.08862323
## [1] 0.003
## [1] "mse 0.000426880520661157"
t = plot_tree(paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.m3.treemix.out"))
##     V1   V2       V3      V4      V5  V6  V7 V8  V9 V10
## 1    0 <NA> NOT_ROOT NOT_MIG NOT_TIP  16 211  1  75   1
## 2    1   V5 NOT_ROOT NOT_MIG     TIP   2  NA NA  NA  NA
## 3    2 <NA> NOT_ROOT NOT_MIG NOT_TIP  76   1  1   3   1
## 4    3   V9 NOT_ROOT NOT_MIG     TIP   2  NA NA  NA  NA
## 5    4   V6 NOT_ROOT NOT_MIG     TIP  52  NA NA  NA  NA
## 6   15   V3 NOT_ROOT NOT_MIG     TIP  32  NA NA  NA  NA
## 7   16 <NA> NOT_ROOT NOT_MIG NOT_TIP  76   0  2 172   4
## 8   31   V1 NOT_ROOT NOT_MIG     TIP 104  NA NA  NA  NA
## 9   32 <NA> NOT_ROOT NOT_MIG NOT_TIP 172  15  1 171   1
## 10  51   V7 NOT_ROOT NOT_MIG     TIP 213  NA NA  NA  NA
## 11  52 <NA> NOT_ROOT NOT_MIG NOT_TIP 213   4  1 136   9
## 12  75   V8 NOT_ROOT NOT_MIG     TIP   0  NA NA  NA  NA
## 13  76 <NA> NOT_ROOT NOT_MIG NOT_TIP 136  16  6   2   2
## 14 103   V4 NOT_ROOT NOT_MIG     TIP 104  NA NA  NA  NA
## 15 104 <NA> NOT_ROOT NOT_MIG NOT_TIP 172  31  1 103   1
## 16 135  V11 NOT_ROOT NOT_MIG     TIP 136  NA NA  NA  NA
## 17 136 <NA> NOT_ROOT NOT_MIG NOT_TIP  52 135  1  76   8
## 18 171   V2 NOT_ROOT NOT_MIG     TIP  32  NA NA  NA  NA
## 19 172 <NA> NOT_ROOT NOT_MIG NOT_TIP  16 104  2  32   2
## 20 211  V10 NOT_ROOT NOT_MIG     TIP   0  NA NA  NA  NA
## 21 213 <NA>     ROOT NOT_MIG NOT_TIP 213  52 10  51   1
## 22 263 <NA> NOT_ROOT     MIG NOT_TIP 104 103 NA  NA  NA
## 23 216 <NA> NOT_ROOT     MIG NOT_TIP 136  76 NA  NA  NA
## 24 289 <NA> NOT_ROOT     MIG NOT_TIP 104 103 NA  NA  NA
##                                                                                                                                                                                                                                                   V11
## 1                                                                                                                                                                                                               (V10:0.0117506,V8:0.191344):0.0245533
## 2                                                                                                                                                                                                                                        V5:0.0070738
## 3                                                                                                                                                                                                              (V5:0.0070738,V9:0.0489904):0.00987258
## 4                                                                                                                                                                                                                                        V9:0.0489904
## 5                                                                                                                                                                                                                                       V6:0.00532132
## 6                                                                                                                                                                                                                                        V3:0.0684707
## 7                                                                                                                         ((V10:0.0117506,V8:0.191344):0.0245533,((V1:0.0440156,V4:0.0552927):0.0235787,(V3:0.0684707,V2:0):0):0.000833902):0.0094842
## 8                                                                                                                                                                                                                                        V1:0.0440156
## 9                                                                                                                                                                                                                               (V3:0.0684707,V2:0):0
## 10                                                                                                                                                                                                                                       V7:0.0137485
## 11                 (V6:0.00532132,(V11:0.0239382,(((V10:0.0117506,V8:0.191344):0.0245533,((V1:0.0440156,V4:0.0552927):0.0235787,(V3:0.0684707,V2:0):0):0.000833902):0.0094842,(V5:0.0070738,V9:0.0489904):0.00987258):0.0171909):0.0118456):0.0137485
## 12                                                                                                                                                                                                                                        V8:0.191344
## 13                                                                     (((V10:0.0117506,V8:0.191344):0.0245533,((V1:0.0440156,V4:0.0552927):0.0235787,(V3:0.0684707,V2:0):0):0.000833902):0.0094842,(V5:0.0070738,V9:0.0489904):0.00987258):0.0171909
## 14                                                                                                                                                                                                                                       V4:0.0552927
## 15                                                                                                                                                                                                              (V1:0.0440156,V4:0.0552927):0.0235787
## 16                                                                                                                                                                                                                                      V11:0.0239382
## 17                                           (V11:0.0239382,(((V10:0.0117506,V8:0.191344):0.0245533,((V1:0.0440156,V4:0.0552927):0.0235787,(V3:0.0684707,V2:0):0):0.000833902):0.0094842,(V5:0.0070738,V9:0.0489904):0.00987258):0.0171909):0.0118456
## 18                                                                                                                                                                                                                                               V2:0
## 19                                                                                                                                                                          ((V1:0.0440156,V4:0.0552927):0.0235787,(V3:0.0684707,V2:0):0):0.000833902
## 20                                                                                                                                                                                                                                      V10:0.0117506
## 21 ((V6:0.00532132,(V11:0.0239382,(((V10:0.0117506,V8:0.191344):0.0245533,((V1:0.0440156,V4:0.0552927):0.0235787,(V3:0.0684707,V2:0):0):0.000833902):0.0094842,(V5:0.0070738,V9:0.0489904):0.00987258):0.0171909):0.0118456):0.0137485,V7:0.0137485);
## 22                                                                                                                                                                                                                                               <NA>
## 23                                                                                                                                                                                                                                               <NA>
## 24                                                                                                                                                                                                                                               <NA>
##             x          y       ymin       ymax
## 1  0.07682254 0.72727273 0.63636364 0.81818182
## 2  0.05973142 0.22727273 0.18181818 0.27272727
## 3  0.05265762 0.18181818 0.09090909 0.27272727
## 4  0.08677643 0.13636364 0.09090909 0.18181818
## 5  0.01906982 0.95454545 0.90909091 1.00000000
## 6  0.09041198 0.40909091 0.36363636 0.45454545
## 7  0.05226924 0.63636364 0.27272727 0.81818182
## 8  0.12069744 0.59090909 0.54545455 0.63636364
## 9  0.05310314 0.36363636 0.27272727 0.45454545
## 10 0.01374850 0.04545455 0.00000000 0.09090909
## 11 0.01374850 0.90909091 0.09090909 1.00000000
## 12 0.12862167 0.68181818 0.63636364 0.72727273
## 13 0.04278504 0.27272727 0.09090909 0.81818182
## 14 0.13197446 0.50000000 0.45454545 0.54545455
## 15 0.07668184 0.54545455 0.45454545 0.63636364
## 16 0.04953230 0.86363636 0.81818182 0.90909091
## 17 0.02559410 0.81818182 0.09090909 0.90909091
## 18 0.05310314 0.31818182 0.27272727 0.36363636
## 19 0.05310314 0.45454545 0.27272727 0.63636364
## 20 0.08857314 0.77272727 0.72727273 0.81818182
## 21 0.00000000 0.09090909 0.00000000 1.00000000
## 22 0.10884164         NA 0.45454545 0.54545455
## 23 0.03012114         NA 0.09090909 0.81818182
## 24 0.13197446         NA 0.45454545 0.54545455
## [1] "0.581629 0.076681842 0.131974462"
## [1] "0.263339 0.0255941 0.04278504"
## [1] "0.614093 0.076681842 0.131974462"

##  [1] 0.05973142 0.08677643 0.01906982 0.09041198 0.12069744 0.01374850
##  [7] 0.12862167 0.13197446 0.04953230 0.05310314 0.08857314
## [1] 0.003
## [1] "mse 0.000426880520661157"
t = plot_tree(paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.m4.treemix.out"))
##     V1   V2       V3      V4      V5  V6  V7 V8  V9 V10
## 1    0 <NA> NOT_ROOT NOT_MIG NOT_TIP  16 211  1  75   1
## 2    1   V5 NOT_ROOT NOT_MIG     TIP   2  NA NA  NA  NA
## 3    2 <NA> NOT_ROOT NOT_MIG NOT_TIP  76   1  1   3   1
## 4    3   V9 NOT_ROOT NOT_MIG     TIP   2  NA NA  NA  NA
## 5    4   V6 NOT_ROOT NOT_MIG     TIP  52  NA NA  NA  NA
## 6   15   V3 NOT_ROOT NOT_MIG     TIP  32  NA NA  NA  NA
## 7   16 <NA> NOT_ROOT NOT_MIG NOT_TIP  76 172  4   0   2
## 8   31   V1 NOT_ROOT NOT_MIG     TIP 104  NA NA  NA  NA
## 9   32 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 171  1  15   1
## 10  51   V7 NOT_ROOT NOT_MIG     TIP 213  NA NA  NA  NA
## 11  52 <NA> NOT_ROOT NOT_MIG NOT_TIP 213 136  9   4   1
## 12  75   V8 NOT_ROOT NOT_MIG     TIP   0  NA NA  NA  NA
## 13  76 <NA> NOT_ROOT NOT_MIG NOT_TIP 136   2  2  16   6
## 14 103   V4 NOT_ROOT NOT_MIG     TIP 104  NA NA  NA  NA
## 15 104 <NA> NOT_ROOT NOT_MIG NOT_TIP 172  31  1 103   1
## 16 135  V11 NOT_ROOT NOT_MIG     TIP 136  NA NA  NA  NA
## 17 136 <NA> NOT_ROOT NOT_MIG NOT_TIP  52 135  1  76   8
## 18 171   V2 NOT_ROOT NOT_MIG     TIP  32  NA NA  NA  NA
## 19 172 <NA> NOT_ROOT NOT_MIG NOT_TIP  16 104  2  32   2
## 20 211  V10 NOT_ROOT NOT_MIG     TIP   0  NA NA  NA  NA
## 21 213 <NA>     ROOT NOT_MIG NOT_TIP 213  51  1  52  10
## 22 263 <NA> NOT_ROOT     MIG NOT_TIP 104 103 NA  NA  NA
## 23 216 <NA> NOT_ROOT     MIG NOT_TIP 136  76 NA  NA  NA
## 24 289 <NA> NOT_ROOT     MIG NOT_TIP 104 103 NA  NA  NA
## 25 336 <NA> NOT_ROOT     MIG NOT_TIP   2   3 NA  NA  NA
##                                                                                                                                                                                                                                                   V11
## 1                                                                                                                                                                                                               (V10:0.0111877,V8:0.201105):0.0250823
## 2                                                                                                                                                                                                                                       V5:0.00656589
## 3                                                                                                                                                                                                              (V5:0.00656589,V9:0.0510023):0.0105293
## 4                                                                                                                                                                                                                                        V9:0.0510023
## 5                                                                                                                                                                                                                                       V6:0.00532145
## 6                                                                                                                                                                                                                                        V3:0.0688054
## 7                                                                                                                         (((V1:0.0440791,V4:0.0553094):0.0235664,(V2:0,V3:0.0688054):0):0.000855883,(V10:0.0111877,V8:0.201105):0.0250823):0.0092669
## 8                                                                                                                                                                                                                                        V1:0.0440791
## 9                                                                                                                                                                                                                               (V2:0,V3:0.0688054):0
## 10                                                                                                                                                                                                                                       V7:0.0137486
## 11                 ((V11:0.0262863,((V5:0.00656589,V9:0.0510023):0.0105293,(((V1:0.0440791,V4:0.0553094):0.0235664,(V2:0,V3:0.0688054):0):0.000855883,(V10:0.0111877,V8:0.201105):0.0250823):0.0092669):0.0180456):0.0110565,V6:0.00532145):0.0137486
## 12                                                                                                                                                                                                                                        V8:0.201105
## 13                                                                     ((V5:0.00656589,V9:0.0510023):0.0105293,(((V1:0.0440791,V4:0.0553094):0.0235664,(V2:0,V3:0.0688054):0):0.000855883,(V10:0.0111877,V8:0.201105):0.0250823):0.0092669):0.0180456
## 14                                                                                                                                                                                                                                       V4:0.0553094
## 15                                                                                                                                                                                                              (V1:0.0440791,V4:0.0553094):0.0235664
## 16                                                                                                                                                                                                                                      V11:0.0262863
## 17                                           (V11:0.0262863,((V5:0.00656589,V9:0.0510023):0.0105293,(((V1:0.0440791,V4:0.0553094):0.0235664,(V2:0,V3:0.0688054):0):0.000855883,(V10:0.0111877,V8:0.201105):0.0250823):0.0092669):0.0180456):0.0110565
## 18                                                                                                                                                                                                                                               V2:0
## 19                                                                                                                                                                          ((V1:0.0440791,V4:0.0553094):0.0235664,(V2:0,V3:0.0688054):0):0.000855883
## 20                                                                                                                                                                                                                                      V10:0.0111877
## 21 (V7:0.0137486,((V11:0.0262863,((V5:0.00656589,V9:0.0510023):0.0105293,(((V1:0.0440791,V4:0.0553094):0.0235664,(V2:0,V3:0.0688054):0):0.000855883,(V10:0.0111877,V8:0.201105):0.0250823):0.0092669):0.0180456):0.0110565,V6:0.00532145):0.0137486);
## 22                                                                                                                                                                                                                                               <NA>
## 23                                                                                                                                                                                                                                               <NA>
## 24                                                                                                                                                                                                                                               <NA>
## 25                                                                                                                                                                                                                                               <NA>
##             x          y       ymin       ymax
## 1  0.07719982 0.18181818 0.09090909 0.27272727
## 2  0.05994581 0.77272727 0.72727273 0.81818182
## 3  0.05337992 0.72727273 0.63636364 0.81818182
## 4  0.08775739 0.68181818 0.63636364 0.72727273
## 5  0.01907005 0.04545455 0.00000000 0.09090909
## 6  0.09024684 0.31818182 0.27272727 0.36363636
## 7  0.05211752 0.27272727 0.09090909 0.63636364
## 8  0.12061890 0.59090909 0.54545455 0.63636364
## 9  0.05297340 0.36363636 0.27272727 0.45454545
## 10 0.01374860 0.95454545 0.90909091 1.00000000
## 11 0.01374860 0.09090909 0.00000000 0.90909091
## 12 0.12897240 0.13636364 0.09090909 0.18181818
## 13 0.04285062 0.63636364 0.09090909 0.81818182
## 14 0.13184928 0.50000000 0.45454545 0.54545455
## 15 0.07653980 0.54545455 0.45454545 0.63636364
## 16 0.04945274 0.86363636 0.81818182 0.90909091
## 17 0.02480510 0.81818182 0.09090909 0.90909091
## 18 0.05297340 0.40909091 0.36363636 0.45454545
## 19 0.05297340 0.45454545 0.27272727 0.63636364
## 20 0.08838752 0.22727273 0.18181818 0.27272727
## 21 0.00000000 0.90909091 0.00000000 1.00000000
## 22 0.10535600         NA 0.45454545 0.54545455
## 23 0.03048592         NA 0.09090909 0.81818182
## 24 0.13184928         NA 0.45454545 0.54545455
## 25 0.08775739         NA 0.63636364 0.72727273
## [1] "0.520999 0.076539803 0.131849283"
## [1] "0.314804 0.0248051 0.04285062"
## [1] "0.605709 0.076539803 0.131849283"
## [1] "1 0.05337992 0.0877573938029508"

##  [1] 0.05994581 0.08775739 0.01907005 0.09024684 0.12061890 0.01374860
##  [7] 0.12897240 0.13184928 0.04945274 0.05297340 0.08838752
## [1] 0.003
## [1] "mse 0.000426880520661157"
t = plot_tree(paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.m5.treemix.out"))
##     V1   V2       V3      V4      V5  V6  V7 V8  V9 V10
## 1    0 <NA> NOT_ROOT NOT_MIG NOT_TIP  16 211  1  75   1
## 2    1   V5 NOT_ROOT NOT_MIG     TIP   2  NA NA  NA  NA
## 3    2 <NA> NOT_ROOT NOT_MIG NOT_TIP  76   3  1   1   1
## 4    3   V9 NOT_ROOT NOT_MIG     TIP   2  NA NA  NA  NA
## 5    4   V6 NOT_ROOT NOT_MIG     TIP  52  NA NA  NA  NA
## 6   15   V3 NOT_ROOT NOT_MIG     TIP 172  NA NA  NA  NA
## 7   16 <NA> NOT_ROOT NOT_MIG NOT_TIP  76 172  4   0   2
## 8   31   V1 NOT_ROOT NOT_MIG     TIP 104  NA NA  NA  NA
## 9   32 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 171  1 104   2
## 10  51   V7 NOT_ROOT NOT_MIG     TIP 213  NA NA  NA  NA
## 11  52 <NA> NOT_ROOT NOT_MIG NOT_TIP 213   4  1 136   9
## 12  75   V8 NOT_ROOT NOT_MIG     TIP   0  NA NA  NA  NA
## 13  76 <NA> NOT_ROOT NOT_MIG NOT_TIP 136  16  6   2   2
## 14 103   V4 NOT_ROOT NOT_MIG     TIP 104  NA NA  NA  NA
## 15 104 <NA> NOT_ROOT NOT_MIG NOT_TIP  32  31  1 103   1
## 16 135  V11 NOT_ROOT NOT_MIG     TIP 136  NA NA  NA  NA
## 17 136 <NA> NOT_ROOT NOT_MIG NOT_TIP  52 135  1  76   8
## 18 171   V2 NOT_ROOT NOT_MIG     TIP  32  NA NA  NA  NA
## 19 172 <NA> NOT_ROOT NOT_MIG NOT_TIP  16  15  1  32   3
## 20 211  V10 NOT_ROOT NOT_MIG     TIP   0  NA NA  NA  NA
## 21 213 <NA>     ROOT NOT_MIG NOT_TIP 213  52 10  51   1
## 22 263 <NA> NOT_ROOT     MIG NOT_TIP 104 103 NA  NA  NA
## 23 216 <NA> NOT_ROOT     MIG NOT_TIP 136  76 NA  NA  NA
## 24 289 <NA> NOT_ROOT     MIG NOT_TIP 104 103 NA  NA  NA
## 25 336 <NA> NOT_ROOT     MIG NOT_TIP   2   3 NA  NA  NA
## 26 370 <NA> NOT_ROOT     MIG NOT_TIP 136  76 NA  NA  NA
##                                                                                                                                                                                                                                                                      V11
## 1                                                                                                                                                                                                                                  (V10:0.0111015,V8:0.201352):0.0248959
## 2                                                                                                                                                                                                                                                          V5:0.00645114
## 3                                                                                                                                                                                                                                 (V9:0.0516077,V5:0.00645114):0.0105381
## 4                                                                                                                                                                                                                                                           V9:0.0516077
## 5                                                                                                                                                                                                                                                          V6:0.00531978
## 6                                                                                                                                                                                                                                                           V3:0.0634468
## 7                                                                                                                         ((V3:0.0634468,(V2:0.000403751,(V1:0.0441021,V4:0.0553085):0.022421):0.000340551):0.00154311,(V10:0.0111015,V8:0.201352):0.0248959):0.00954277
## 8                                                                                                                                                                                                                                                           V1:0.0441021
## 9                                                                                                                                                                                                      (V2:0.000403751,(V1:0.0441021,V4:0.0553085):0.022421):0.000340551
## 10                                                                                                                                                                                                                                                          V7:0.0137477
## 11                 (V6:0.00531978,(V11:0.0264164,(((V3:0.0634468,(V2:0.000403751,(V1:0.0441021,V4:0.0553085):0.022421):0.000340551):0.00154311,(V10:0.0111015,V8:0.201352):0.0248959):0.00954277,(V9:0.0516077,V5:0.00645114):0.0105381):0.0181715):0.0110111):0.0137477
## 12                                                                                                                                                                                                                                                           V8:0.201352
## 13                                                                     (((V3:0.0634468,(V2:0.000403751,(V1:0.0441021,V4:0.0553085):0.022421):0.000340551):0.00154311,(V10:0.0111015,V8:0.201352):0.0248959):0.00954277,(V9:0.0516077,V5:0.00645114):0.0105381):0.0181715
## 14                                                                                                                                                                                                                                                          V4:0.0553085
## 15                                                                                                                                                                                                                                  (V1:0.0441021,V4:0.0553085):0.022421
## 16                                                                                                                                                                                                                                                         V11:0.0264164
## 17                                           (V11:0.0264164,(((V3:0.0634468,(V2:0.000403751,(V1:0.0441021,V4:0.0553085):0.022421):0.000340551):0.00154311,(V10:0.0111015,V8:0.201352):0.0248959):0.00954277,(V9:0.0516077,V5:0.00645114):0.0105381):0.0181715):0.0110111
## 18                                                                                                                                                                                                                                                        V2:0.000403751
## 19                                                                                                                                                                           (V3:0.0634468,(V2:0.000403751,(V1:0.0441021,V4:0.0553085):0.022421):0.000340551):0.00154311
## 20                                                                                                                                                                                                                                                         V10:0.0111015
## 21 ((V6:0.00531978,(V11:0.0264164,(((V3:0.0634468,(V2:0.000403751,(V1:0.0441021,V4:0.0553085):0.022421):0.000340551):0.00154311,(V10:0.0111015,V8:0.201352):0.0248959):0.00954277,(V9:0.0516077,V5:0.00645114):0.0105381):0.0181715):0.0110111):0.0137477,V7:0.0137477);
## 22                                                                                                                                                                                                                                                                  <NA>
## 23                                                                                                                                                                                                                                                                  <NA>
## 24                                                                                                                                                                                                                                                                  <NA>
## 25                                                                                                                                                                                                                                                                  <NA>
## 26                                                                                                                                                                                                                                                                  <NA>
##             x          y       ymin       ymax
## 1  0.07736893 0.36363636 0.27272727 0.45454545
## 2  0.05991950 0.13636364 0.09090909 0.18181818
## 3  0.05346836 0.18181818 0.09090909 0.27272727
## 4  0.08785954 0.22727273 0.18181818 0.27272727
## 5  0.01906748 0.95454545 0.90909091 1.00000000
## 6  0.09100414 0.77272727 0.72727273 0.81818182
## 7  0.05247303 0.45454545 0.27272727 0.81818182
## 8  0.12087979 0.59090909 0.54545455 0.63636364
## 9  0.05435669 0.63636364 0.45454545 0.72727273
## 10 0.01374770 0.04545455 0.00000000 0.09090909
## 11 0.01374770 0.90909091 0.09090909 1.00000000
## 12 0.12915199 0.31818182 0.27272727 0.36363636
## 13 0.04293026 0.27272727 0.09090909 0.81818182
## 14 0.13208616 0.50000000 0.45454545 0.54545455
## 15 0.07677769 0.54545455 0.45454545 0.63636364
## 16 0.04944041 0.86363636 0.81818182 0.90909091
## 17 0.02475880 0.81818182 0.09090909 0.90909091
## 18 0.05469824 0.68181818 0.63636364 0.72727273
## 19 0.05401614 0.72727273 0.45454545 0.81818182
## 20 0.08847043 0.40909091 0.36363636 0.45454545
## 21 0.00000000 0.09090909 0.00000000 1.00000000
## 22 0.10434659         NA 0.45454545 0.54545455
## 23 0.03040578         NA 0.09090909 0.81818182
## 24 0.13208616         NA 0.45454545 0.54545455
## 25 0.08785954         NA 0.09090909 0.18181818
## 26 0.04293026         NA 0.09090909 0.81818182
## [1] "0.498457 0.076777691 0.132086161"
## [1] "0.310761 0.0247588 0.04293026"
## [1] "0.67518 0.076777691 0.132086161"
## [1] "1 0.05346836 0.0878595404120696"
## [1] "0.619417 0.0247588 0.04293026"

##  [1] 0.05991950 0.08785954 0.01906748 0.09100414 0.12087979 0.01374770
##  [7] 0.12915199 0.13208616 0.04944041 0.05469824 0.08847043
## [1] 0.003
## [1] "mse 0.000426880520661157"
t = plot_tree(paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.m6.treemix.out"))
##     V1   V2       V3      V4      V5  V6  V7 V8 V9 V10
## 1    0 <NA> NOT_ROOT NOT_MIG NOT_TIP  16 211  1 75   1
## 2    1   V5 NOT_ROOT NOT_MIG     TIP   2  NA NA NA  NA
## 3    2 <NA> NOT_ROOT NOT_MIG NOT_TIP  76   3  1  1   1
## 4    3   V9 NOT_ROOT NOT_MIG     TIP   2  NA NA NA  NA
## 5    4   V6 NOT_ROOT NOT_MIG     TIP  52  NA NA NA  NA
## 6   15   V3 NOT_ROOT NOT_MIG     TIP  32  NA NA NA  NA
## 7   16 <NA> NOT_ROOT NOT_MIG NOT_TIP  76 172  4  0   2
## 8   31   V1 NOT_ROOT NOT_MIG     TIP 104  NA NA NA  NA
## 9   32 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 171  1 15   1
## 10  51   V7 NOT_ROOT NOT_MIG     TIP 213  NA NA NA  NA
## 11  52 <NA> NOT_ROOT NOT_MIG NOT_TIP 213 136  9  4   1
## 12  75   V8 NOT_ROOT NOT_MIG     TIP   0  NA NA NA  NA
## 13  76 <NA> NOT_ROOT NOT_MIG NOT_TIP 136  16  6  2   2
## 14 103   V4 NOT_ROOT NOT_MIG     TIP 104  NA NA NA  NA
## 15 104 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 103  1 31   1
## 16 135  V11 NOT_ROOT NOT_MIG     TIP 136  NA NA NA  NA
## 17 136 <NA> NOT_ROOT NOT_MIG NOT_TIP  52 135  1 76   8
## 18 171   V2 NOT_ROOT NOT_MIG     TIP  32  NA NA NA  NA
## 19 172 <NA> NOT_ROOT NOT_MIG NOT_TIP  16 104  2 32   2
## 20 211  V10 NOT_ROOT NOT_MIG     TIP   0  NA NA NA  NA
## 21 213 <NA>     ROOT NOT_MIG NOT_TIP 213  51  1 52  10
## 22 263 <NA> NOT_ROOT     MIG NOT_TIP 104 103 NA NA  NA
## 23 216 <NA> NOT_ROOT     MIG NOT_TIP 136  76 NA NA  NA
## 24 291 <NA> NOT_ROOT     MIG NOT_TIP 104 103 NA NA  NA
## 25 336 <NA> NOT_ROOT     MIG NOT_TIP   2   3 NA NA  NA
## 26 370 <NA> NOT_ROOT     MIG NOT_TIP 136  76 NA NA  NA
## 27 501 <NA> NOT_ROOT     MIG NOT_TIP  32  15 NA NA  NA
##                                                                                                                                                                                                                                                                     V11
## 1                                                                                                                                                                                                                                  (V10:0.0108221,V8:0.205125):0.025204
## 2                                                                                                                                                                                                                                                          V5:0.0049158
## 3                                                                                                                                                                                                                                 (V9:0.0641069,V5:0.0049158):0.0120451
## 4                                                                                                                                                                                                                                                          V9:0.0641069
## 5                                                                                                                                                                                                                                                         V6:0.00531979
## 6                                                                                                                                                                                                                                                          V3:0.0604399
## 7                                                                                                                        (((V4:0.0552339,V1:0.0438529):0.0224576,(V2:0.000582587,V3:0.0604399):2.82921e-05):0.00182254,(V10:0.0108221,V8:0.205125):0.025204):0.00976326
## 8                                                                                                                                                                                                                                                          V1:0.0438529
## 9                                                                                                                                                                                                                             (V2:0.000582587,V3:0.0604399):2.82921e-05
## 10                                                                                                                                                                                                                                                         V7:0.0137477
## 11                 ((V11:0.0266826,((((V4:0.0552339,V1:0.0438529):0.0224576,(V2:0.000582587,V3:0.0604399):2.82921e-05):0.00182254,(V10:0.0108221,V8:0.205125):0.025204):0.00976326,(V9:0.0641069,V5:0.0049158):0.0120451):0.0180575):0.0108971,V6:0.00531979):0.0137477
## 12                                                                                                                                                                                                                                                          V8:0.205125
## 13                                                                     ((((V4:0.0552339,V1:0.0438529):0.0224576,(V2:0.000582587,V3:0.0604399):2.82921e-05):0.00182254,(V10:0.0108221,V8:0.205125):0.025204):0.00976326,(V9:0.0641069,V5:0.0049158):0.0120451):0.0180575
## 14                                                                                                                                                                                                                                                         V4:0.0552339
## 15                                                                                                                                                                                                                                (V4:0.0552339,V1:0.0438529):0.0224576
## 16                                                                                                                                                                                                                                                        V11:0.0266826
## 17                                           (V11:0.0266826,((((V4:0.0552339,V1:0.0438529):0.0224576,(V2:0.000582587,V3:0.0604399):2.82921e-05):0.00182254,(V10:0.0108221,V8:0.205125):0.025204):0.00976326,(V9:0.0641069,V5:0.0049158):0.0120451):0.0180575):0.0108971
## 18                                                                                                                                                                                                                                                       V2:0.000582587
## 19                                                                                                                                                                         ((V4:0.0552339,V1:0.0438529):0.0224576,(V2:0.000582587,V3:0.0604399):2.82921e-05):0.00182254
## 20                                                                                                                                                                                                                                                        V10:0.0108221
## 21 (V7:0.0137477,((V11:0.0266826,((((V4:0.0552339,V1:0.0438529):0.0224576,(V2:0.000582587,V3:0.0604399):2.82921e-05):0.00182254,(V10:0.0108221,V8:0.205125):0.025204):0.00976326,(V9:0.0641069,V5:0.0049158):0.0120451):0.0180575):0.0108971,V6:0.00531979):0.0137477);
## 22                                                                                                                                                                                                                                                                 <NA>
## 23                                                                                                                                                                                                                                                                 <NA>
## 24                                                                                                                                                                                                                                                                 <NA>
## 25                                                                                                                                                                                                                                                                 <NA>
## 26                                                                                                                                                                                                                                                                 <NA>
## 27                                                                                                                                                                                                                                                                 <NA>
##             x          y       ymin       ymax
## 1  0.07766955 0.36363636 0.27272727 0.45454545
## 2  0.05966319 0.13636364 0.09090909 0.18181818
## 3  0.05474739 0.18181818 0.09090909 0.27272727
## 4  0.09029900 0.22727273 0.18181818 0.27272727
## 5  0.01906749 0.04545455 0.00000000 0.09090909
## 6  0.09153321 0.50000000 0.45454545 0.54545455
## 7  0.05246555 0.45454545 0.27272727 0.81818182
## 8  0.12059859 0.68181818 0.63636364 0.72727273
## 9  0.05431638 0.54545455 0.45454545 0.63636364
## 10 0.01374770 0.95454545 0.90909091 1.00000000
## 11 0.01374770 0.09090909 0.00000000 0.90909091
## 12 0.12946881 0.31818182 0.27272727 0.36363636
## 13 0.04270229 0.27272727 0.09090909 0.81818182
## 14 0.13197962 0.77272727 0.72727273 0.81818182
## 15 0.07674569 0.72727273 0.63636364 0.81818182
## 16 0.04940283 0.86363636 0.81818182 0.90909091
## 17 0.02464480 0.81818182 0.09090909 0.90909091
## 18 0.05480875 0.59090909 0.54545455 0.63636364
## 19 0.05428809 0.63636364 0.45454545 0.81818182
## 20 0.08849165 0.40909091 0.36363636 0.45454545
## 21 0.00000000 0.90909091 0.00000000 1.00000000
## 22 0.11787899         NA 0.63636364 0.72727273
## 23 0.03050944         NA 0.09090909 0.81818182
## 24 0.13197962         NA 0.63636364 0.72727273
## 25 0.09029900         NA 0.09090909 0.18181818
## 26 0.04270229         NA 0.09090909 0.81818182
## 27 0.05999265         NA 0.45454545 0.54545455
## [1] "0.744711 0.07674569 0.13197962"
## [1] "0.324776 0.0246448 0.04270229"
## [1] "0.768226 0.07674569 0.13197962"
## [1] "1 0.05474739 0.0902990037952442"
## [1] "0.574053 0.0246448 0.04270229"
## [1] "0.152519 0.0543163821 0.09153320730176"

##  [1] 0.05966319 0.09029900 0.01906749 0.09153321 0.12059859 0.01374770
##  [7] 0.12946881 0.13197962 0.04940283 0.05480875 0.08849165
## [1] 0.003
## [1] "mse 0.000426880520661157"

Along with the treemix software, are distribution the software threepop and fourpop. These measure f3 and f4 statistics which test for treeness in population trees.

The three-population test is of the form f3(A;B;C), where a significantly negative value of the f3 statistic implies that population A is admixed. The output is four columns: populations | f3 statistic | standard error | Z-score

#threepop -i ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix.gz -k 500 | grep "^V" > ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix.3pop.tab

threepop \
   -i /Users/afeurtey/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp.high_anc_coef_snmf.treemix.gz \
   -k 500 | \
   grep "^V" > \
   /Users/afeurtey/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp.high_anc_coef_snmf.treemix.3pop.tab
   
#${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix.3pop.tab
temp = read_delim(paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.treemix.3pop.tab"), delim = " ", 
           col_names = c("populations", "f3 statistic", "standard error", "Z-score")) %>%
  filter(`f3 statistic` < 0)

kable(temp)
populations f3 statistic standard error Z-score
V2;V1,V11 -0.0015012 0.0008542 -1.757460
V2;V1,V5 -0.0003266 0.0006020 -0.542508
V2;V1,V6 -0.0007156 0.0007107 -1.006930
V2;V1,V7 -0.0012880 0.0005334 -2.414910
V2;V1,V8 -0.0010022 0.0007398 -1.354720
V2;V11,V3 -0.0008395 0.0007929 -1.058810
V2;V11,V4 -0.0007121 0.0009595 -0.742185
V2;V3,V5 -0.0006051 0.0005058 -1.196310
V2;V3,V6 -0.0010501 0.0008162 -1.286560
V2;V3,V7 -0.0017748 0.0006643 -2.671630
V2;V4,V7 -0.0005809 0.0006630 -0.876153
V2;V4,V8 -0.0004811 0.0008289 -0.580392
separate(temp, col = populations, into = c("Introgressed_pop", "Pop1", "Pop2"), remove = F) %>%
  ggplot(aes(x = Pop1, y = Pop2, fill = `Z-score`)) +
    geom_tile() + 
    theme_cowplot()

Split tree

threshold_per_pop = 6
threshold_per_country = 20

#Subsample
temp = Zt_meta %>%
  #filter(!(ID_file %in% filtered_samples$ID_file)) %>% 
  #unite(Country, Region, col = "Country2", remove = F) %>%
  #mutate(Country_for_filter = ifelse(Country == "USA", Country2, Country)) %>%
  mutate(x = round(Latitude, 1), y = round(Longitude, 1)) %>%
  unite(Coord_for_filter, x, y, sep = ";", remove = F) %>%
  group_by(Coord_for_filter) %>% 
  dplyr::mutate(Nb_per_coord = n()) %>% 
  group_by(Country) %>% 
  dplyr::mutate(Nb_per_country= n())
  
small_pop = temp %>%
  filter(Nb_per_coord <= threshold_per_pop) %>%
  filter(Nb_per_country <= threshold_per_country)

temp2 = bind_rows(temp %>%
                    filter(!(ID_file %in% small_pop$ID_file))%>%
                    filter(Nb_per_coord > threshold_per_pop) %>% 
                    group_by(Coord_for_filter) %>%
                    dplyr::sample_n(threshold_per_pop),
                  temp %>%
                    filter(!(ID_file %in% small_pop$ID_file))%>%
                    filter(Nb_per_coord <= threshold_per_pop)) %>% 
        ungroup() %>%
        group_by(Country) %>% 
        dplyr::mutate(Nb_per_country= n())

Zt_meta_even_sampling = bind_rows(temp2 %>% 
                    filter(Nb_per_country > threshold_per_country) %>%
                    dplyr::sample_n(threshold_per_country),
                  temp2 %>% 
                    filter(Nb_per_country <= threshold_per_country)) %>%
  bind_rows(., small_pop) %>%
  ungroup()


write_tsv(Zt_meta_even_sampling %>% dplyr::select(ID_file), 
          paste0(PopStr_dir, "List_samples_even_sampling.txt"),
          col_names = F)


ggplot() +
  geom_bar(data = temp %>% group_by(Continent, Country) %>% dplyr::count(),
           aes(x= Country, y = n, fill = Continent), alpha = 0.4, stat = "identity") +
  geom_bar(data = Zt_meta_even_sampling %>% group_by(Continent, Country) %>% dplyr::count(),
           aes(x= Country, y = n, fill = Continent), alpha = 0.4, stat = "identity") +
  Fill_Continent + theme_bw() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

head ${POPSTR}List_samples_even_sampling.txt

vcftools --vcf ${VCFDIR}$VCFNAME.recode.vcf \
  --keep ${POPSTR}List_samples_even_sampling.txt \
  --remove-filtered-all --extract-FORMAT-info GT \
  --max-missing 1.0 --min-alleles 2 --max-alleles 2 \
  --maf 0.05 \
  --out ${POPSTR}$VCFNAME.even_sampling


cat  ${POPSTR}$VCFNAME.even_sampling.GT.FORMAT | cut -f 3- \
    > ${POPSTR}$VCFNAME.even_sampling.GT.FORMAT2
cat  ${POPSTR}$VCFNAME.even_sampling.GT.FORMAT | cut -f 1,2 \
    > ${POPSTR}$VCFNAME.even_sampling.GT.FORMAT.pos
head -n1 ${POPSTR}$VCFNAME.even_sampling.GT.FORMAT2 | gsed "s/\t/\n/g"  \
    > ${POPSTR}$VCFNAME.even_sampling.ind
gsed "s/\t//g"  ${POPSTR}$VCFNAME.even_sampling.GT.FORMAT2 | tail -n +2 \
    > ${POPSTR}$VCFNAME.even_sampling.geno
    
datamash transpose < ${POPSTR}$VCFNAME.even_sampling.GT.FORMAT2  |  sed 's/^/>/' | sed 's/\t/\n/' | sed 's/\t//g' | cut -c -150 > ${POPSTR}$VCFNAME.even_sampling.fasta



Basic statistics: diversity and differenciation


Summary statistics of diversity per clusters

#rsync -avP  alice@130.125.25.244:/data2/alice/WW_project/3_Sumstats_demography/Sample_list_V*_*_diversity_pi.tsv \
#    ~/Documents/Postdoc_Bruce/Projects/WW_project/3_Sumstats_demography/

echo -e "Subset_samples\tChromosome\tStart\tStop\tPi\tTajimaD\tTheta" > \
  ../3_Sumstats_demography/Diversity_per_cluster.tsv
cat ../3_Sumstats_demography/Sample_list_V*_*_diversity_pi.tsv >>  \
  ../3_Sumstats_demography/Diversity_per_cluster.tsv
ordering_table = tibble(Cluster = c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11"),
                        Order_tree = c(4, 10, 9, 6, 7, 5, 1, 3, 2, 11, 8))

diversity_values = read_tsv(paste0(Sumstats_dir, "Diversity_per_cluster.tsv"), na = "nan")

temp = diversity_values %>% 
  dplyr::select(-Theta) %>%
  mutate(Subset_samples = str_remove(Subset_samples, "Sample_list_")) %>%
  separate(Subset_samples, into = c("Cluster", "Rep")) %>%
  group_by(Cluster, Chromosome, Start, Stop) %>%
  dplyr::summarize(Pi = mean(Pi, na.rm = T), TajimaD = mean(TajimaD, na.rm = T)) %>%
  pivot_longer(cols = c(Pi, TajimaD), names_to = "Estimate", values_to = "Value") %>%
  left_join(., ordering_table) %>%
  arrange(Order_tree) 

median_values = ungroup(temp) %>%
  group_by(Estimate) %>%
  dplyr::summarize(average = median(Value, na.rm = T))

pi_data = temp %>%
  filter(Estimate == "Pi") %>%
  filter(!is.na(Cluster)) %>%
  group_by(Cluster) %>%
  dplyr::mutate(cluster_avg = median(Value, na.rm = T))

p1 = ggplot(pi_data, aes(x = reorder(Cluster, -Order_tree), y = Value, fill = Cluster))  +
  geom_boxplot(outlier.shape = NA) + 
  #facet_wrap(vars(Estimate), scales = "free") +
  coord_flip() + ylim(c(-0.001, 0.065)) +
  labs(y = "Diversity (pi)") +
  theme(legend.position = "None") +
  geom_hline(yintercept = pull(median_values %>% filter(Estimate == "Pi") %>% dplyr::select(average)),
             linetype = "dashed")


p2 = temp %>%
  filter(Estimate == "TajimaD") %>%
  ggplot(aes(x = reorder(Cluster, -Order_tree), y = Value, fill = Cluster)) +
  geom_violin() +
  geom_boxplot(width=.2, fill = "white", outlier.shape = NA) + 
  #facet_wrap(vars(Estimate), scales = "free") +
  coord_flip() + 
  labs(y = "Tajima's D", x = "") +
  theme_cowplot() +
  theme(legend.position = "None") +
  geom_hline(yintercept = pull(median_values %>% filter(Estimate == "TajimaD") %>% dplyr::select(average)),
             linetype = "dashed")

cowplot::plot_grid(p1, p2, ncol = 2)

#One-way ANOVA with blocks
##Define linear model

model = lm(Value ~ Cluster,
          data=pi_data)
summary(model)   ### Will show overall p-value and r-squared
## 
## Call:
## lm(formula = Value ~ Cluster, data = pi_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.016725 -0.008952 -0.004141  0.005035  0.114297 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.027e-02  7.997e-05 128.458  < 2e-16 ***
## ClusterV10   5.996e-03  1.131e-04  53.023  < 2e-16 ***
## ClusterV11  -5.197e-03  1.131e-04 -45.952  < 2e-16 ***
## ClusterV2   -8.199e-04  1.131e-04  -7.250 4.17e-13 ***
## ClusterV3    3.654e-03  1.131e-04  32.312  < 2e-16 ***
## ClusterV4   -1.250e-04  1.131e-04  -1.105    0.269    
## ClusterV5   -3.679e-03  1.131e-04 -32.533  < 2e-16 ***
## ClusterV6    5.228e-03  1.131e-04  46.231  < 2e-16 ***
## ClusterV7    3.635e-03  1.131e-04  32.141  < 2e-16 ***
## ClusterV8    2.284e-03  1.131e-04  20.200  < 2e-16 ***
## ClusterV9    6.452e-03  1.131e-04  57.055  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01314 on 297099 degrees of freedom
## Multiple R-squared:  0.07332,    Adjusted R-squared:  0.07329 
## F-statistic:  2351 on 10 and 297099 DF,  p-value: < 2.2e-16
##Conduct analysis of variance
Anova(model,type = "II")  
## Anova Table (Type II tests)
## 
## Response: Value
##           Sum Sq     Df F value    Pr(>F)    
## Cluster    4.060     10  2350.7 < 2.2e-16 ***
## Residuals 51.315 297099                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model)
## 
## Call:
## lm(formula = Value ~ Cluster, data = pi_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.016725 -0.008952 -0.004141  0.005035  0.114297 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.027e-02  7.997e-05 128.458  < 2e-16 ***
## ClusterV10   5.996e-03  1.131e-04  53.023  < 2e-16 ***
## ClusterV11  -5.197e-03  1.131e-04 -45.952  < 2e-16 ***
## ClusterV2   -8.199e-04  1.131e-04  -7.250 4.17e-13 ***
## ClusterV3    3.654e-03  1.131e-04  32.312  < 2e-16 ***
## ClusterV4   -1.250e-04  1.131e-04  -1.105    0.269    
## ClusterV5   -3.679e-03  1.131e-04 -32.533  < 2e-16 ***
## ClusterV6    5.228e-03  1.131e-04  46.231  < 2e-16 ***
## ClusterV7    3.635e-03  1.131e-04  32.141  < 2e-16 ***
## ClusterV8    2.284e-03  1.131e-04  20.200  < 2e-16 ***
## ClusterV9    6.452e-03  1.131e-04  57.055  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01314 on 297099 degrees of freedom
## Multiple R-squared:  0.07332,    Adjusted R-squared:  0.07329 
## F-statistic:  2351 on 10 and 297099 DF,  p-value: < 2.2e-16
hist(residuals(model), col="darkgray")

#Post-hoc analysis:  mean separation tests
marginal = lsmeans(model, ~ Cluster)

pairs(marginal, adjust="tukey")
##  contrast   estimate       SE     df  t.ratio p.value
##  V1 - V10  -6.00e-03 0.000113 297099  -53.023  <.0001
##  V1 - V11   5.20e-03 0.000113 297099   45.952  <.0001
##  V1 - V2    8.20e-04 0.000113 297099    7.250  <.0001
##  V1 - V3   -3.65e-03 0.000113 297099  -32.312  <.0001
##  V1 - V4    1.25e-04 0.000113 297099    1.105  0.9907
##  V1 - V5    3.68e-03 0.000113 297099   32.533  <.0001
##  V1 - V6   -5.23e-03 0.000113 297099  -46.231  <.0001
##  V1 - V7   -3.63e-03 0.000113 297099  -32.141  <.0001
##  V1 - V8   -2.28e-03 0.000113 297099  -20.200  <.0001
##  V1 - V9   -6.45e-03 0.000113 297099  -57.055  <.0001
##  V10 - V11  1.12e-02 0.000113 297099   98.975  <.0001
##  V10 - V2   6.82e-03 0.000113 297099   60.273  <.0001
##  V10 - V3   2.34e-03 0.000113 297099   20.711  <.0001
##  V10 - V4   6.12e-03 0.000113 297099   54.128  <.0001
##  V10 - V5   9.68e-03 0.000113 297099   85.556  <.0001
##  V10 - V6   7.68e-04 0.000113 297099    6.793  <.0001
##  V10 - V7   2.36e-03 0.000113 297099   20.882  <.0001
##  V10 - V8   3.71e-03 0.000113 297099   32.824  <.0001
##  V10 - V9  -4.56e-04 0.000113 297099   -4.032  0.0027
##  V11 - V2  -4.38e-03 0.000113 297099  -38.702  <.0001
##  V11 - V3  -8.85e-03 0.000113 297099  -78.264  <.0001
##  V11 - V4  -5.07e-03 0.000113 297099  -44.847  <.0001
##  V11 - V5  -1.52e-03 0.000113 297099  -13.419  <.0001
##  V11 - V6  -1.04e-02 0.000113 297099  -92.183  <.0001
##  V11 - V7  -8.83e-03 0.000113 297099  -78.093  <.0001
##  V11 - V8  -7.48e-03 0.000113 297099  -66.152  <.0001
##  V11 - V9  -1.16e-02 0.000113 297099 -103.007  <.0001
##  V2 - V3   -4.47e-03 0.000113 297099  -39.562  <.0001
##  V2 - V4   -6.95e-04 0.000113 297099   -6.145  <.0001
##  V2 - V5    2.86e-03 0.000113 297099   25.282  <.0001
##  V2 - V6   -6.05e-03 0.000113 297099  -53.481  <.0001
##  V2 - V7   -4.45e-03 0.000113 297099  -39.391  <.0001
##  V2 - V8   -3.10e-03 0.000113 297099  -27.450  <.0001
##  V2 - V9   -7.27e-03 0.000113 297099  -64.305  <.0001
##  V3 - V4    3.78e-03 0.000113 297099   33.417  <.0001
##  V3 - V5    7.33e-03 0.000113 297099   64.844  <.0001
##  V3 - V6   -1.57e-03 0.000113 297099  -13.919  <.0001
##  V3 - V7    1.93e-05 0.000113 297099    0.171  1.0000
##  V3 - V8    1.37e-03 0.000113 297099   12.112  <.0001
##  V3 - V9   -2.80e-03 0.000113 297099  -24.743  <.0001
##  V4 - V5    3.55e-03 0.000113 297099   31.428  <.0001
##  V4 - V6   -5.35e-03 0.000113 297099  -47.335  <.0001
##  V4 - V7   -3.76e-03 0.000113 297099  -33.246  <.0001
##  V4 - V8   -2.41e-03 0.000113 297099  -21.305  <.0001
##  V4 - V9   -6.58e-03 0.000113 297099  -58.160  <.0001
##  V5 - V6   -8.91e-03 0.000113 297099  -78.763  <.0001
##  V5 - V7   -7.31e-03 0.000113 297099  -64.674  <.0001
##  V5 - V8   -5.96e-03 0.000113 297099  -52.732  <.0001
##  V5 - V9   -1.01e-02 0.000113 297099  -89.587  <.0001
##  V6 - V7    1.59e-03 0.000113 297099   14.090  <.0001
##  V6 - V8    2.94e-03 0.000113 297099   26.031  <.0001
##  V6 - V9   -1.22e-03 0.000113 297099  -10.824  <.0001
##  V7 - V8    1.35e-03 0.000113 297099   11.941  <.0001
##  V7 - V9   -2.82e-03 0.000113 297099  -24.914  <.0001
##  V8 - V9   -4.17e-03 0.000113 297099  -36.855  <.0001
## 
## P value adjustment: tukey method for comparing a family of 11 estimates
CLD = cld(marginal,
          alpha   = 0.05,
          Letters = letters,  ### Use lower-case letters for .group
          adjust  = "tukey")  ### Tukey-adjusted p-values

CLD
##  Cluster  lsmean    SE     df lower.CL upper.CL .group    
##  V11     0.00508 8e-05 297099  0.00485  0.00530  a        
##  V5      0.00659 8e-05 297099  0.00637  0.00682   b       
##  V2      0.00945 8e-05 297099  0.00923  0.00968    c      
##  V4      0.01015 8e-05 297099  0.00992  0.01037     d     
##  V1      0.01027 8e-05 297099  0.01005  0.01050     d     
##  V8      0.01256 8e-05 297099  0.01233  0.01278      e    
##  V7      0.01391 8e-05 297099  0.01368  0.01413       f   
##  V3      0.01393 8e-05 297099  0.01370  0.01415       f   
##  V6      0.01550 8e-05 297099  0.01527  0.01573        g  
##  V10     0.01627 8e-05 297099  0.01604  0.01650         h 
##  V9      0.01672 8e-05 297099  0.01650  0.01695          i
## 
## Confidence level used: 0.95 
## Conf-level adjustment: sidak method for 11 estimates 
## P value adjustment: tukey method for comparing a family of 11 estimates 
## significance level used: alpha = 0.05 
## NOTE: Compact letter displays can be misleading
##       because they show NON-findings rather than findings.
##       Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
CLD$.group=gsub(" ", "", CLD$.group)

### Plot
pi_ave = pull(median_values %>% filter(Estimate == "Pi") %>% dplyr::select(average))

p1 = ggplot(pi_data, aes(x = reorder(Cluster, -Order_tree), y = Value))  +
  coord_flip() +
  geom_boxplot(outlier.shape = NA, alpha = .4, color = "grey") + 
  geom_segment(aes(x = Cluster, xend = Cluster,
        y = pi_ave, yend = cluster_avg, color = Cluster), size = 0.8) +
  #geom_jitter(size = 1.5, alpha = 0.2, width = 0.2) +
  geom_hline(aes(yintercept = pi_ave), color = "gray20", size = 0.6) +
  geom_point(aes(color = Cluster, x = Cluster, y = cluster_avg), size = 3) + 
  #stat_summary(aes(color = Cluster), fun = mean, geom = "point", size = 2) +
  theme_cowplot() +
    theme(legend.position = "none",
          axis.text.x = element_text(angle = 40, hjust = 1)) +
  labs (x = "", y = "Diversity (pi)") +
  ylim(c(-0.001, 0.055)) +
  geom_text(data = CLD, aes(x = Cluster, label = .group, y = 0.054), color   = "black")

cowplot::plot_grid(p1, p2, ncol = 2)

Variant effects

#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/2_Population_structure/Sample_list_V*.ann.tsv ~/Documents/Postdoc_Bruce/Projects/WW_project/3_Sumstats_demography/

list_snpEff = list()
for (K in c(1:chosen_K)){
#for (K in c(1:3)){
  print(K)
snpEff_per_cluster = read_tsv(paste0(Sumstats_dir, "Sample_list_V", K, ".ann.tsv"), col_names = c("CHR", "BP", "REF", "ALT", "ANN"))%>%
  separate(ANN, sep = ",", 
           into = c("ANN1", "ANN2", "ANN3", "ANN4", "ANN5", "ANN6", "ANN7", "ANN8", "ANN9", "ANN10", "ANN11",
                    "ANN12", "ANN13", "ANN14", "ANN15", "ANN16", "ANN17"), 
           extra = "warn") %>%
  pivot_longer(cols = c(-CHR, -BP, -REF, -ALT), names_to = "ANN", values_to = "Annotation") %>%
  filter(!is.na(Annotation)) %>%
  separate(Annotation, into = c("ALT2", "Variant_type", "Effect_strength", "Gene", "Rest"), sep = "\\|", extra = "merge")%>% 
  dplyr::mutate(Ranked_effect_strength = Effect_strength)
snpEff_per_cluster$Ranked_effect_strength[snpEff_per_cluster$Ranked_effect_strength == "HIGH"] = 1
snpEff_per_cluster$Ranked_effect_strength[snpEff_per_cluster$Ranked_effect_strength == "MODERATE"] = 2
snpEff_per_cluster$Ranked_effect_strength[snpEff_per_cluster$Ranked_effect_strength == "LOW"] = 3
snpEff_per_cluster$Ranked_effect_strength[snpEff_per_cluster$Ranked_effect_strength == "MODIFIER"] = 4
snpEff_per_cluster$Ranked_effect_strength[is.na(snpEff_per_cluster$Effect_strength)] = 5
snpEff_per_cluster$Effect_strength[is.na(snpEff_per_cluster$Effect_strength)] = "NO_EFFECT"

snpEff_per_cluster = snpEff_per_cluster %>%
  group_by(CHR, BP) %>% 
  dplyr::mutate(ranktemp = rank(Ranked_effect_strength, ties.method = "random")) %>%
  filter(ranktemp == 1) 

list_snpEff[[K]] = ungroup(snpEff_per_cluster) %>%
  dplyr::count(Effect_strength) %>%
  dplyr::mutate(Cluster = K)
}
summary_snpEff_per_cluster = bind_rows(list_snpEff) 
write_tsv(summary_snpEff_per_cluster, paste0(Sumstats_dir, "Summary_snpEff_per_cluster.tsv"))
summary_snpEff_per_cluster = read_tsv(paste0(Sumstats_dir, "Summary_snpEff_per_cluster.tsv"))

summary_snpEff_per_cluster %>%
  ggplot(aes(x = as.factor(Cluster), y = n, fill =Effect_strength)) +
  geom_bar(stat = "identity") +
  theme_bw()

summary_snpEff_per_cluster %>%
  group_by(Cluster) %>%
  dplyr::mutate(total = sum(n),
                prop = n/total) %>%
  ggplot(aes(x = Effect_strength, y = prop, fill = as.factor(Cluster))) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_bw()

summary_snpEff_per_cluster %>%
  group_by(Cluster) %>%
  dplyr::mutate(total = sum(n),
                prop = n/total) %>%
  filter(Effect_strength != "MODIFIER") %>%
  ggplot(aes(x = as.factor(Cluster), y = prop, fill = as.factor(Effect_strength))) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_bw()

summary_snpEff_per_cluster %>%
  group_by(Cluster)  %>%
  dplyr::mutate(total = sum(n)) %>%
  filter(Effect_strength != "MODIFIER") %>%
  filter(Effect_strength != "LOW") %>%
    mutate(prop = n/total) %>%
  ggplot(aes(x = as.factor(Cluster), y = prop, fill = as.factor(Effect_strength))) +
  geom_bar(stat = "identity") +
  theme_bw()

Summary statistics of divergence

#for i in  {1..11} ; do for j in {1..11} ; do  python Divergence_with_scikit-allel_Fst.py --vcf_file ../1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp.recode.vcf  --sample_list1 ../2_Population_structure/Sample_list_V${i}.args --sample_list2 ../2_Population_structure/Sample_list_V${j}.args --out_dir ../2_Population_structure/Divergence/ --no_header ; done ; done

# echo -e "Subset1\tSubset2\tHudsons_Fst\tWeir_Cockerham_Fst" > \
 # ../2_Population_structure/Divergence/Divergence.Fst_between_clusters.tsv
# cat ../2_Population_structure/Divergence/Divergence.Fst.Sample_list_V*_vs_Sample_list_V*.tsv >>  \
 # ../2_Population_structure/Divergence/Divergence.Fst_between_clusters.tsv
ordering_table = tibble(Clusters = c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11"),
                        Order_tree = c(4, 10, 9, 6, 7, 5, 1, 3, 2, 11, 8))

Fst_values = read_tsv(paste0(PopStr_dir, "Divergence.Fst_between_clusters.tsv"))
temp = Fst_values %>% dplyr::select(-Hudsons_Fst) %>%
  mutate(Weir_Cockerham_Fst = ifelse(Weir_Cockerham_Fst < 0, 0, Weir_Cockerham_Fst),
         Subset1 = str_remove(Subset1, "Sample_list_"),
         Subset2 = str_remove(Subset2, "Sample_list_")) %>%
  left_join(., ordering_table, by = c("Subset1" = "Clusters")) %>%
  left_join(., ordering_table, by = c("Subset2" = "Clusters")) %>%
  arrange(Order_tree.x, Order_tree.y) %>%
  dplyr::select(-Order_tree.x, -Order_tree.y) %>%
  pivot_wider(names_from = Subset2, values_from = Weir_Cockerham_Fst)
WC_Fst_values <- as.matrix(temp[,-1])
rownames(WC_Fst_values) <- temp[,1] %>% pull()

corrplot(WC_Fst_values, method="circle", is.corr=FALSE, 
         tl.col = "black", tl.srt=45, tl.cex = 1)

         #type="upper")
library(geodist)
library(ggExtra)

#Estimate geographic distances between samples
temp = dplyr::select(Zt_meta, ID_file, Latitude, Longitude) %>%
  filter(!is.na(Latitude))
geo_distances = as.data.frame(geodist(x = temp, sequential = FALSE, measure = "geodesic")) 
colnames(geo_distances) <- temp$ID_file
geo_distances$ID_file1 = temp$ID_file
geo_distances = as.tibble(geo_distances) %>%
  pivot_longer(-ID_file1, names_to = "ID_file2", values_to = "Geo_distance")
  

#Run IBS with plink on cluster and import
# ${SOFTPATH}plink   --distance square ibs   --vcf ${vcf_dir}${VCFBasename}.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.vcf.gz   --out ${vcf_dir}${VCFBasename}.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80   --const-fid   --not-chr 14-21 mt
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.mibs.id ~/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/0_Nuclear_genome/All_good_samples.mibs.id
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.mibs ~/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/0_Nuclear_genome/All_good_samples.mibs

#../Software/vcftools_jydu/src/cpp/vcftools --gzvcf ${vcf_dir}${VCFBasename}.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.vcf.gz  --missing-indv  --out temp
#rsync -avP alice@130.125.25.244:/home/alice/WW_PopGen/temp.imiss ~/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/0_Nuclear_genome/All_good_samples.miss


#Read genetic distances
ibs = read_tsv(paste0(nuc_PS_dir, "All_good_samples.mibs.id"), col_names = c("Whatever", "ID_file1")) %>% 
  dplyr::select(ID_file1) %>% pull()
related = read_tsv(paste0(nuc_PS_dir, "All_good_samples.mibs"), col_names = ibs) %>%
  mutate(ID_file1 = ibs) %>%
  pivot_longer(names_to = "ID_file2", values_to = "Relatedness", cols = -ID_file1) 
distances = inner_join(related, geo_distances) 

#To do: compare intra-continent and inter-continent
#To do: compare intra-cluster and inter-cluster
#
distances = distances %>% 
  inner_join(Zt_meta %>% dplyr::select(ID_file1 = ID_file, Continent1 = Continent)) %>% 
  inner_join(Zt_meta %>% dplyr::select(ID_file2 = ID_file, Continent2 = Continent)) %>%
  mutate(Continent_comparison = ifelse(Continent1 == Continent2, "Intra", "Inter")) %>% 
  filter(Continent1 != "Asia") %>%
  filter(ID_file1 != ID_file2) 


#Dot plots
distances  %>%
  filter(Continent_comparison == "Intra") %>%
  ggplot(aes(x = Geo_distance, y = Relatedness, col = Continent1)) + 
  geom_point(shape = 1, alpha = 0.6) + 
  theme_bw() +
  facet_wrap(vars(Continent1), scales = "free") +
  Color_Continent + 
  stat_smooth(col = "grey20", method = "lm")#, formula = "y ~ log(x)")

# Discretized distances
temp = distances %>%
  filter(Continent_comparison == "Intra") %>%
  filter(Geo_distance < 4000000) %>%
  mutate(Geo_distance_disc = as.factor(500*round(Geo_distance/500000))) %>%
  dplyr::count(Continent1, Geo_distance_disc) #%>%
#filter(n >= 100)

distances %>%
  filter(Continent_comparison == "Intra") %>%
  mutate(Geo_distance_disc = as.factor(500*round(Geo_distance/500000)))%>%
 # inner_join(temp) %>%
  filter(Geo_distance < 4000000) %>%
  ggplot(aes(x = Geo_distance_disc, y = Relatedness, col = Continent1)) +
  geom_boxplot(outlier.shape = 1) + 
  theme_bw() +
  facet_wrap(vars(Continent1), scales = "free_y") +
  Color_Continent  +
  geom_text(data = temp, aes(x = Geo_distance_disc, y = 1, label = n), 
            col = "black", angle = 90, size = 4) +
  theme(axis.text.x = element_text(angle = 45, hjust =1, vjust = 1))

temp = list()
for (i in c(1:chosen_K)){
  temp[[i]] = read_tsv(paste0(PopStr_dir, "Sample_list_V", i, ".args"), col_names = "ID_file") %>%
    mutate( Cluster  = paste0("V", i)) }

list_pure_cluster = bind_rows(temp)

distances = distances %>% 
  inner_join(list_pure_cluster %>% dplyr::select(ID_file1 = ID_file, Cluster1 = Cluster)) %>% 
  inner_join(list_pure_cluster %>% dplyr::select(ID_file2 = ID_file, Cluster2 = Cluster)) %>%
  mutate(Cluster_comparison = ifelse(Cluster1 == Cluster2, "Intra", "Inter")) 




distances %>%
  filter(Cluster_comparison == "Intra") %>%
 # inner_join(temp) %>%
#  filter(Geo_distance < 4000000) %>%
  ggplot(aes(x = Geo_distance, y = Relatedness, col = Cluster1)) +
  geom_point() + 
  theme_bw() +
  facet_wrap(vars(Cluster1), scales = "free")  +
  theme(axis.text.x = element_text(angle = 45, hjust =1, vjust = 1))+ 
  stat_smooth(col = "grey20", method = "lm")

# Discretized distances
distances %>%
  filter(Cluster_comparison == "Intra") %>%
  mutate(Geo_distance_disc = as.factor(500*round(Geo_distance/500000)))%>%
 # inner_join(temp) %>%
 # filter(Geo_distance < 4000000) %>%
  ggplot(aes(x = Geo_distance_disc, y = Relatedness, col = Cluster1)) +
  geom_boxplot(outlier.shape = 1) + 
  theme_bw() +
  facet_wrap(vars(Cluster1), scales = "free_y") +
  #geom_text(data = temp, aes(x = Geo_distance_disc, y = 1, label = n), 
  #          col = "black", angle = 90, size = 4) +
  theme(axis.text.x = element_text(angle = 45, hjust =1, vjust = 1))

distances %>%
  filter(Cluster_comparison == "Intra") %>%
  mutate(Geo_distance_disc = as.factor(100*round(Geo_distance/100000)))%>%
 # inner_join(temp) %>%
  filter(Geo_distance < 1000000) %>%
  ggplot(aes(x = Geo_distance_disc, y = Relatedness, col = Cluster1)) +
  geom_boxplot(outlier.shape = 1) + 
  theme_bw() +
  facet_wrap(vars(Cluster1), scales = "free_y") +
  #geom_text(data = temp, aes(x = Geo_distance_disc, y = 1, label = n), 
  #          col = "black", angle = 90, size = 4) +
  theme(axis.text.x = element_text(angle = 45, hjust =1, vjust = 1))

distances %>%
  filter(Cluster_comparison == "Intra") %>%
  mutate(Geo_distance_disc = as.factor(50*round(Geo_distance/50000)))%>%
 # inner_join(temp) %>%
  filter(Geo_distance < 500000) %>%
  ggplot(aes(x = Geo_distance_disc, y = Relatedness, col = Cluster1)) +
  geom_boxplot(outlier.shape = 1) + 
  theme_bw() +
  facet_wrap(vars(Cluster1), scales = "free_y") +
  #geom_text(data = temp, aes(x = Geo_distance_disc, y = 1, label = n), 
  #          col = "black", angle = 90, size = 4) +
  theme(axis.text.x = element_text(angle = 45, hjust =1, vjust = 1))

# Comparing fit of models in Europe

set.seed(123)
subset_distances = distances %>% filter(Continent1 == "Europe") %>%
  filter(Continent_comparison == "Intra") 
training.samples <- distances$Relatedness %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data  <- subset_distances[training.samples, ]
test.data <- subset_distances[-training.samples, ]

# Build the model for linear regression
model <- lm(Relatedness ~ Geo_distance, data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
  RMSE = RMSE(predictions, test.data$Relatedness),
  R2 = R2(predictions, test.data$Relatedness)
)
##          RMSE         R2
## 1 0.002795046 0.03554424
# Build the model for log regression
model <- lm(Relatedness ~ log(Geo_distance + 1), data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
  RMSE = RMSE(predictions, test.data$Relatedness),
  R2 = R2(predictions, test.data$Relatedness)
)
##          RMSE         R2
## 1 0.002819238 0.01878363
# Build the model
model <- gam(Relatedness ~ s(Geo_distance), data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
  RMSE = RMSE(predictions, test.data$Relatedness),
  R2 = R2(predictions, test.data$Relatedness)
)
##          RMSE         R2
## 1 0.002748006 0.06772863
ggplot(train.data, aes(Geo_distance, Relatedness) ) +
  geom_point() +
  stat_smooth(method = gam, formula = y ~ s(x))

ggplot(train.data, aes(Geo_distance, Relatedness) ) +
  geom_hex() 

#

temp = distances %>% filter(Continent1 == "Europe") %>% 
  filter(Continent2 == "Europe") %>% 
   mutate(Relat_group = ifelse(Relatedness > 0.935, "High", ifelse(Relatedness < 0.925, "Low", "Medium"))) %>% 
  inner_join(Zt_meta %>% dplyr::select(ID_file1 = ID_file, Country1 = Country, Year1 = Sampling_Date)) %>% 
  inner_join(Zt_meta %>% dplyr::select(ID_file2 = ID_file, Country2 = Country, Year2 = Sampling_Date)) %>%
  dplyr::select(-c("Continent1", "Continent2", "Continent_comparison", "Cluster1", "Cluster2"))


ungroup(temp) %>% group_by(Relat_group, Year1, Year2) %>%
  dplyr::count() %>%
  ggplot(aes(x = Year1, y = Year2, size = n, col = Relat_group))+
  geom_abline(slope = 1, intercept = 0, col = "grey30") +
  geom_point(alpha = .4) + 
  theme_bw() + facet_wrap(vars(Relat_group)) 

ungroup(temp) %>% group_by(Relat_group, Country1, Country2) %>%
  dplyr::count() %>%
  ggplot(aes(x = Country1, y = Country2, size = n, col = Relat_group))+
  geom_abline(slope = 1, intercept = 0, col = "grey30") +
  geom_point(alpha = .4) + 
  theme_bw() + facet_wrap(vars(Relat_group))

p1 = bind_rows(ungroup(temp) %>% dplyr::select(Relatedness, ID_file = ID_file1),
          ungroup(temp) %>% dplyr::select(Relatedness, ID_file = ID_file2))%>% 
  group_by(ID_file) %>% 
  dplyr::summarize(median_rel = median(Relatedness), ave_rel = mean(Relatedness)) %>% 
  dplyr::mutate(median_rel_cat = ifelse(median_rel > 0.935, "High", ifelse(median_rel < 0.925, "Low", "Medium"))) %>%
  full_join(read_tsv(paste0(nuc_PS_dir, "All_good_samples.miss")), by = c("ID_file" = "INDV"))

ggplot(data = p1, aes(median_rel, F_MISS)) + #TODO: compare with missing data...
  geom_point(alpha = .4) + 
    geom_text(data = p1 %>% filter(median_rel_cat != "Medium"), 
              aes (median_rel, F_MISS, label = ID_file)) 

inner_join(p1, dplyr::select(Zt_meta, Latitude, Longitude, ID_file)) %>%
  dplyr::filter(median_rel_cat != "Medium") %>%
  ggplot(aes(x = Latitude, y = Longitude, col = median_rel_cat)) +
  geom_point() +
  theme_bw()

distances = distances %>% 
  inner_join(Zt_meta %>% dplyr::select(ID_file1 = ID_file, Year1 = Sampling_Date)) %>% 
  inner_join(Zt_meta %>% dplyr::select(ID_file2 = ID_file, Year2 = Sampling_Date)) %>%
  filter(!is.na(Year1) & !is.na(Year2)) %>%
  mutate(Time_diff = abs(Year1 - Year2))

samples_to_exclude = p1 %>% filter(median_rel_cat != "Medium") %>% pull(ID_file)

p2 = distances %>% filter(Continent1 == "Europe") %>%
  filter(!(ID_file1 %in% samples_to_exclude)) %>%
  filter(!(ID_file2 %in% samples_to_exclude)) %>%
  filter(Continent_comparison == "Intra") %>%
  ggplot(aes(Geo_distance, Relatedness, col = Time_diff) ) +
  geom_point(alpha = .4) +
  coord_cartesian(ylim = c(0.91, 0.975))

p3 = distances %>% filter(Continent1 == "Europe") %>%
  filter(Continent_comparison == "Intra") %>%
  ggplot(aes(Geo_distance, Relatedness, col = Time_diff) ) +
  geom_point(alpha = .4) +
  coord_cartesian(ylim = c(0.91, 0.975))

cowplot::plot_grid(p2, p3, ncol = 2)

# Model
subset_distances = distances %>% filter(Continent1 == "Europe") %>%
  filter(Continent_comparison == "Intra") %>%
  filter(!(ID_file1 %in% samples_to_exclude)) %>%
  filter(!(ID_file2 %in% samples_to_exclude)) 
training.samples <- subset_distances$Relatedness %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data  <- subset_distances[training.samples, ]
test.data <- subset_distances[-training.samples, ]

# Build the model for linear regression
model <- lm(Relatedness ~ Geo_distance, data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
  RMSE = RMSE(predictions, test.data$Relatedness),
  R2 = R2(predictions, test.data$Relatedness)
)
##          RMSE         R2
## 1 0.001056302 0.01144103
library(mgcv)
# Build the model
model <- gam(Relatedness ~ s(Geo_distance), data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
  RMSE = RMSE(predictions, test.data$Relatedness),
  R2 = R2(predictions, test.data$Relatedness)
)
##          RMSE       R2
## 1 0.001033025 0.054408
ggplot(train.data, aes(Geo_distance, Relatedness) ) +
  geom_point(alpha = .2) +
  stat_smooth(method = gam, formula = y ~ s(x)) +
  coord_cartesian(ylim = c(0.92, 0.94)) +
  theme_bw()



Wheat trade and wheat production across the world

# Get area for country, wheat culture and percentage of land dedicated to wheat
temp = read_csv(paste0(metadata_dir, "FAOSTAT_data_4-15-2021_land_stats.csv"), 
                col_types = "ccdcdcdcddcdc")
wheat_per_country = read_csv(paste0(metadata_dir, "FAOSTAT_data_4-9-2021_crops_production.csv")) %>% 
  filter(Unit == "ha") %>% 
  dplyr::select(Area, `Year Code`, Value)  %>%
  group_by(Area) %>%
  summarize(Wheat_ha = mean(Value)) %>%
  inner_join(.,  filter(temp, Item == "Arable land") %>%
               mutate(Arable_land_area_in_ha = Value * 1000) %>%
               dplyr::select(Area, Arable_land_area_in_ha)) %>%
  mutate(Arable_land_for_wheat = Wheat_ha * 100 / Arable_land_area_in_ha) %>%
  inner_join(., filter(temp, Item == "Land area") %>%
               mutate(Land_area_in_ha = Value * 1000) %>%
               dplyr::select(Area, Land_area_in_ha)) %>%
  mutate(Land_for_wheat = Wheat_ha * 100 / Land_area_in_ha) 

wheat_per_state = read_tsv(paste0(metadata_dir, "USDA_crop_production_2017-2019.txt")) %>%
  mutate(Land_area_in_ha = `Land area` * 1000)  %>% 
  dplyr::select(Area, Year, Wheat_ha, Land_area_in_ha)  %>%
  group_by(Area) %>%
  summarize(Wheat_ha = mean(Wheat_ha), Land_area_in_ha = mean(Land_area_in_ha)) %>%
  mutate(Land_for_wheat = Wheat_ha * 100 / Land_area_in_ha)

wheat_per_country = bind_rows(wheat_per_country, wheat_per_state) 

#Normalize country names
correspondence_table = readxl::read_excel(path = paste0(metadata_dir, "FAO_stats_map_corres.xlsx"), sheet = "Summary")
for (i in c(1:nrow(correspondence_table))) {
  print(pull(correspondence_table[i, 2])) 
  print(pull(correspondence_table[i, 1]))
  wheat_per_country =  wheat_per_country %>% 
    mutate_at(vars(contains('Area')), 
              funs(str_replace(., pull(correspondence_table[i, 2]), pull(correspondence_table[i, 1]))))
}
## [1] "Bolivia (Plurinational State of)"
## [1] "Bolivia"
## [1] "Brunei Darussalam"
## [1] "Brunei"
## [1] "Czechia"
## [1] "Czech Republic"
## [1] "French Guyana"
## [1] "French Guiana"
## [1] "Iran (Islamic Republic of)"
## [1] "Iran"
## [1] "Côte d'Ivoire"
## [1] "Ivory Coast"
## [1] "North Macedonia"
## [1] "Macedonia"
## [1] "Melanesia"
## [1] "Mayotte"
## [1] "Republic of Moldova"
## [1] "Moldova"
## [1] "Democratic People's Republic of Korea"
## [1] "North Korea"
## [1] "Congo"
## [1] "Republic of Congo"
## [1] "Réunion"
## [1] "Reunion"
## [1] "Russian Federation"
## [1] "Russia"
## [1] "Saint Kitts and Nevis"
## [1] "Saint Kitts"
## [1] "Saint Vincent and the Grenadines"
## [1] "Saint Vincent"
## [1] "Republic of Korea"
## [1] "South Korea"
## [1] "Syrian Arab Republic"
## [1] "Syria"
## [1] "United Republic of Tanzania"
## [1] "Tanzania"
## [1] "Trinidad and Tobago"
## [1] "Trinidad"
## [1] "United Kingdom of Great Britain and Northern Ireland"
## [1] "UK"
## [1] "United States of America"
## [1] "USA"
## [1] "Venezuela (Bolivarian Republic of)"
## [1] "Venezuela"
#Maps 
world2 = left_join(world, wheat_per_country, by = c("region" = "Area"))

temp = Zt_meta %>%
   dplyr::count(Country, Latitude, Longitude, name = "Number_genomes") %>%
   filter(Number_genomes > 0)

ggplot() + theme_void() +
  geom_polygon(data = world2, aes(x=long, y = lat, group = group, fill = Land_for_wheat), alpha=0.7) +
  scale_size("Number of genomes", limits = c(1, max_circle)) +
  scale_fill_gradient(low = "#FEEDD8", high = "goldenrod", na.value="#F7F4F3")

#rsync -avP  alice@130.125.25.244:/data2/alice/WW_project/3_Sumstats_demography/Sample_list_V*_*_diversity_pi.tsv \
#    ~/Documents/Postdoc_Bruce/Projects/WW_project/3_Sumstats_demography/

echo -e "Subset_samples\tChromosome\tStart\tStop\tPi\tTajimaD\tTheta" > \
  ../3_Sumstats_demography/Diversity_per_country.tsv
for x in Algeria Argentina Australia Australia_NSW Australia_Tasmania Belgium Canada Chile Czech_Republic Denmark France Germany Iran Ireland Israel Netherlands New_Zealand Switzerland Tunisia Turkey UK USA_Indiana USA_Missouri USA_Oregon USA_Texas Uruguay; 
  do
  cat ../3_Sumstats_demography/Sample_list_${x}_diversity_pi.tsv >>  \
    ../3_Sumstats_demography/Diversity_per_country.tsv
done
#Setting countries and windows
temp_countries = c("Algeria", "Argentina", "Australia", "Belgium", "Canada", "Chile", "Czech_Republic", 
                   "Denmark", "France", "Germany", "Iran", "Ireland", "Israel", "Netherlands", 
                   "New_Zealand", "Switzerland", "Tunisia", "Turkey", "UK",
                   "USA_Indiana", "USA_Missouri", "USA_Oregon", "USA_Texas", "Uruguay")

diversity_values = read_tsv(paste0(Sumstats_dir, "Diversity_per_country.tsv"), na = "nan")

temp = diversity_values %>% dplyr::select(-Theta, -TajimaD) %>%
  mutate(Country = str_remove(Subset_samples, "Sample_list_")) %>%
  group_by(Country) %>%
  summarize(Pi = median(Pi, na.rm = T)) 
temp = countries %>%
  mutate(Country = ifelse(Country == "Czech Republic", "Czech_Republic", Country))%>%
  mutate(Country = ifelse(Country == "New Zealand", "New_Zealand", Country)) %>%
  right_join(., temp)

summary_diversity = wheat_per_country %>%
  mutate(Area = ifelse(Area == "United States of America", "USA", Area)) %>%
  mutate(Area = ifelse(Area == "United Kingdom of Great Britain and Northern Ireland", "UK", Area)) %>%
  mutate(Area = ifelse(Area == "Czechia", "Czech_Republic", Area)) %>%
  mutate(Area = ifelse(Area == "New Zealand", "New_Zealand", Area)) %>%
  mutate(Area = ifelse(Area == "Iran (Islamic Republic of)", "Iran", Area)) %>%
  inner_join(., temp, by = c("Area" = "Country")) 

p3 = pivot_longer(summary_diversity, cols = c(Arable_land_for_wheat, Land_for_wheat), 
               names_to = "Wheat_qty_est", values_to = "Percent") %>%
  ggplot(aes(x = Pi, y = Percent, label = Area, col = Continent)) +
  geom_text() + 
  theme_bw() +
  facet_wrap(vars(Wheat_qty_est), scales = "free_y") +
  Color_Continent + 
  xlim(c(0.004, 0.012))



p1 = filter(summary_diversity, Continent == "Europe") %>%
  ggplot(aes(x = Pi, y = Land_for_wheat, label = Area, col = Continent)) +
  geom_text() + 
  theme_bw() +
  Color_Continent

p2 = filter(summary_diversity, Continent != "Europe") %>%
  ggplot(aes(x = Pi, y = Land_for_wheat, label = Area, col = Continent)) +
  geom_text() + 
  theme_bw() +
  Color_Continent

row = cowplot::plot_grid(p1, p2, nrow = 1) 
cowplot::plot_grid(p3, row, ncol = 1)

model1 = lm(Pi ~ Land_for_wheat + Continent, data = summary_diversity)
summary(model1)
## 
## Call:
## lm(formula = Pi ~ Land_for_wheat + Continent, data = summary_diversity)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -2.869e-03 -2.051e-04  7.367e-05  5.061e-04  1.481e-03 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             1.155e-02  8.345e-04  13.838 6.03e-10 ***
## Land_for_wheat          1.088e-04  1.017e-04   1.070 0.301376    
## ContinentEurope        -1.358e-03  9.203e-04  -1.476 0.160715    
## ContinentMiddle East    2.215e-05  1.059e-03   0.021 0.983583    
## ContinentNorth America -4.427e-03  9.389e-04  -4.715 0.000276 ***
## ContinentOceania       -4.549e-03  1.376e-03  -3.305 0.004808 ** 
## ContinentSouth America -3.776e-03  1.018e-03  -3.711 0.002090 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001101 on 15 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.8273, Adjusted R-squared:  0.7582 
## F-statistic: 11.97 on 6 and 15 DF,  p-value: 5.53e-05
model2 = lm(Pi ~ Land_for_wheat + Continent + Land_for_wheat*Continent, data = summary_diversity)
summary(model2)
## 
## Call:
## lm(formula = Pi ~ Land_for_wheat + Continent + Land_for_wheat * 
##     Continent, data = summary_diversity)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0016415 -0.0000839  0.0000077  0.0002099  0.0008036 
## 
## Coefficients: (1 not defined because of singularities)
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            1.189e-02  7.882e-04  15.079 1.08e-08
## Land_for_wheat                        -5.682e-06  2.141e-04  -0.027 0.979300
## ContinentEurope                       -1.177e-03  9.502e-04  -1.238 0.241328
## ContinentMiddle East                  -9.506e-04  1.101e-03  -0.864 0.406187
## ContinentNorth America                 1.068e-03  6.020e-03   0.177 0.862384
## ContinentOceania                      -4.859e-03  1.000e-03  -4.857 0.000505
## ContinentSouth America                -9.245e-03  1.325e-03  -6.975 2.34e-05
## Land_for_wheat:ContinentEurope         2.629e-05  2.288e-04   0.115 0.910595
## Land_for_wheat:ContinentMiddle East    2.166e-04  2.393e-04   0.905 0.384833
## Land_for_wheat:ContinentNorth America -4.866e-03  5.093e-03  -0.955 0.359903
## Land_for_wheat:ContinentOceania               NA         NA      NA       NA
## Land_for_wheat:ContinentSouth America  3.771e-03  7.398e-04   5.097 0.000346
##                                          
## (Intercept)                           ***
## Land_for_wheat                           
## ContinentEurope                          
## ContinentMiddle East                     
## ContinentNorth America                   
## ContinentOceania                      ***
## ContinentSouth America                ***
## Land_for_wheat:ContinentEurope           
## Land_for_wheat:ContinentMiddle East      
## Land_for_wheat:ContinentNorth America    
## Land_for_wheat:ContinentOceania          
## Land_for_wheat:ContinentSouth America ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.000666 on 11 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.9537, Adjusted R-squared:  0.9115 
## F-statistic: 22.64 on 10 and 11 DF,  p-value: 6.426e-06
temp = read_tsv(paste0(Sumstats_dir, "Summary_diversity_pi.tsv"), 
         col_names = c("Subset_size", "Country", "Year", "Lat", "Long", "Repeat", "Mean_pi", "Median_pi")) %>%
  unite(col = "Place", Country, Year, Lat, Long, remove = FALSE) %>%
  #mutate(Country = ifelse(str_detect(Country, "^USA"), "USA", Country)) %>%
  mutate(Country = ifelse(str_detect(Country, "^Australia"), "Australia", Country))%>%
  left_join(Zt_meta %>% dplyr::select(Country, Continent) %>% distinct()) %>%
  mutate(Continent = ifelse(Country == "Australia", "Oceania", Continent)) %>%
  mutate(Continent = ifelse(Country == "NewZealand", "Oceania", Continent)) %>%
  mutate(Continent = ifelse(Country == "USA", "North America", Continent)) 

temp %>%
  ggplot(aes(x = Place, y = Median_pi)) +
  geom_point(col = "goldenrod", alpha = .6) +
  facet_wrap(vars(Subset_size)) +
  coord_flip() + theme_bw()

summary_diversity2 = wheat_per_country %>%
  #mutate(Area = ifelse(Area == "United States of America", "USA", Area)) %>%
  mutate(Area = ifelse(Area == "United Kingdom of Great Britain and Northern Ireland", "UK", Area)) %>%
  mutate(Area = ifelse(Area == "Czechia", "Czech_Republic", Area)) %>%
  mutate(Area = ifelse(Area == "New Zealand", "NewZealand", Area)) %>%
  mutate(Area = ifelse(Area == "Iran (Islamic Republic of)", "Iran", Area)) %>%
  inner_join(., filter(temp, Subset_size == 6) %>% group_by(Country, Continent) %>% summarize(Median_pi = median(Median_pi)), 
             by = c("Area" = "Country")) 

pivot_longer(summary_diversity2, cols = c(Arable_land_for_wheat, Land_for_wheat, Wheat_ha), 
               names_to = "Wheat_qty_est", values_to = "Land_use") %>%
  ggplot(aes(x = Median_pi, y = Land_use, label = Area, col = Continent)) +
  geom_text() + 
  theme_bw() +
  facet_wrap(vars(Wheat_qty_est), scales = "free_y") +
  Color_Continent 

filter(summary_diversity2, Continent == "Europe") %>%
  ggplot(aes(x = Median_pi, y = Arable_land_for_wheat, label = Area, col = Continent)) +
  geom_text() + 
  theme_bw()  +
  Color_Continent 

#Read and correct capitals for coordinates per country
wheat_trade = read_csv(paste0(metadata_dir, "FAOSTAT_data_4-9-2021_detailed_trade_matrix.csv")) 

correspondence_table = readxl::read_excel(path = paste0(metadata_dir, "FAO_stats_map_corres.xlsx"), sheet = "Summary")
for (i in c(1:nrow(correspondence_table))) {
  print(pull(correspondence_table[i, 2])) 
  print(pull(correspondence_table[i, 1]))
  wheat_trade =  wheat_trade %>% 
    mutate_at(vars(contains('Countries')), 
              funs(str_replace(., pull(correspondence_table[i, 2]), pull(correspondence_table[i, 1]))))
}
## [1] "Bolivia (Plurinational State of)"
## [1] "Bolivia"
## [1] "Brunei Darussalam"
## [1] "Brunei"
## [1] "Czechia"
## [1] "Czech Republic"
## [1] "French Guyana"
## [1] "French Guiana"
## [1] "Iran (Islamic Republic of)"
## [1] "Iran"
## [1] "Côte d'Ivoire"
## [1] "Ivory Coast"
## [1] "North Macedonia"
## [1] "Macedonia"
## [1] "Melanesia"
## [1] "Mayotte"
## [1] "Republic of Moldova"
## [1] "Moldova"
## [1] "Democratic People's Republic of Korea"
## [1] "North Korea"
## [1] "Congo"
## [1] "Republic of Congo"
## [1] "Réunion"
## [1] "Reunion"
## [1] "Russian Federation"
## [1] "Russia"
## [1] "Saint Kitts and Nevis"
## [1] "Saint Kitts"
## [1] "Saint Vincent and the Grenadines"
## [1] "Saint Vincent"
## [1] "Republic of Korea"
## [1] "South Korea"
## [1] "Syrian Arab Republic"
## [1] "Syria"
## [1] "United Republic of Tanzania"
## [1] "Tanzania"
## [1] "Trinidad and Tobago"
## [1] "Trinidad"
## [1] "United Kingdom of Great Britain and Northern Ireland"
## [1] "UK"
## [1] "United States of America"
## [1] "USA"
## [1] "Venezuela (Bolivarian Republic of)"
## [1] "Venezuela"
our_wheat_trade = wheat_trade %>%
  filter(Element == "Import Quantity" & Unit == "tonnes") %>%
  group_by(`Reporter Countries`, `Partner Countries`) %>%
  dplyr::select(`Reporter Countries`, `Partner Countries`, Value) %>%
  summarize(Average_value = mean(Value))


#Read and correct capitals for coordinates per country
capital = read_csv(paste0(metadata_dir, "simple_maps_worldcities.csv")) %>% 
  filter(capital == "primary") %>%
    group_by(country) %>%
    mutate(rank = rank(population, ties.method = "random"))  %>%
    filter(rank == 1)
correspondence_table = readxl::read_excel(path = paste0(metadata_dir, "FAO_stats_map_corres.xlsx"), sheet = "Summary_capital")
for (i in c(1:nrow(correspondence_table))) {
  print(pull(correspondence_table[i, 2])) 
  print(pull(correspondence_table[i, 1]))
  capital =  capital %>% 
    mutate_at(vars(contains('country')), 
              funs(str_replace(., pull(correspondence_table[i, 2]), pull(correspondence_table[i, 1]))))
}
## [1] "Antigua And Barbuda"
## [1] "Antigua"
## [1] "Bahamas, The"
## [1] "Bahamas"
## [1] "Bosnia And Herzegovina"
## [1] "Bosnia and Herzegovina"
## [1] "Curaçao"
## [1] "Curacao"
## [1] "Czechia"
## [1] "Czech Republic"
## [1] "Congo (Kinshasa)"
## [1] "Democratic Republic of the Congo"
## [1] "Falkland Islands (Islas Malvinas)"
## [1] "Falkland Islands"
## [1] "Gambia, The"
## [1] "Gambia"
## [1] "Isle Of Man"
## [1] "Isle of Man"
## [1] "Côte D’Ivoire"
## [1] "Ivory Coast"
## [1] "Micronesia, Federated States Of"
## [1] "Micronesia"
## [1] "Congo (Brazzaville)"
## [1] "Republic of Congo"
## [1] "Saint Helena, Ascension, And Tristan Da Cunha"
## [1] "Saint Helena"
## [1] "Saint Kitts And Nevis"
## [1] "Saint Kitts"
## [1] "Saint Vincent And The Grenadines"
## [1] "Saint Vincent"
## [1] "Sao Tome And Principe"
## [1] "Sao Tome and Principe"
## [1] "South Georgia And South Sandwich Islands"
## [1] "South Georgia"
## [1] "Korea, South"
## [1] "South Korea"
## [1] "Trinidad And Tobago"
## [1] "Tobago"
## [1] "Turks And Caicos Islands"
## [1] "Turks and Caicos Islands"
## [1] "United Kingdom"
## [1] "UK"
## [1] "United States"
## [1] "USA"
## [1] "Vatican City"
## [1] "Vatican"
## [1] "Wallis And Futuna"
## [1] "Wallis and Futuna"
temp = inner_join(our_wheat_trade, capital %>%
                   dplyr::select(country, yend = lat, xend = lng),
                   by = c("Reporter Countries" = "country")) %>%
        inner_join(., capital %>%
                   dplyr::select(country, y = lat, x = lng),
                   by = c("Partner Countries" = "country")) 

temp2 = filter(temp,`Partner Countries` != `Reporter Countries`) %>% 
  filter(Average_value > 10000) %>% distinct()


#
ggplot() + theme_void() +
  geom_polygon(data = world2, 
               aes(x=long, y = lat, group = group, fill = Land_for_wheat), 
               alpha=0.7) +
  scale_fill_gradient(low = "#FEEDD8", high = "#faa307", na.value="#F7F4F3") +
  geom_curve(data = temp2, aes(x = x, y = y, xend = xend, yend = yend, 
                               size = Average_value, alpha = Average_value), 
             curvature = 0.2, color = "#432818") +
  scale_size_continuous(guide = FALSE, range = c(0.02, 0.5))